require(data.table)
require(lubridate)
require(zoo)
require(forecast)
data_path = "/Users/onurcanaydin/Desktop/360 proje/ProjectRawData.csv"
data <- fread(data_path)
#TRENDYOLMILLA TAYT
##Introduction and Model Construction
After filtering our whole data for Trendmilla Tayt,we plot graph for amount that has been sold.
maproducts <- data[data$product_content_id == 31515569]
require(ggplot2)
ggplot(maproducts, aes(x = event_date, y = sold_count)) + geom_line()
That plot has wide variance, which urges us to take logarithm of sale data.Moreover, we add day,month ,trend data for further process.Then,we plot log of sold products over time to see if we decrease variance or not.
maproducts[, `:=`(month, month(event_date))]
maproducts[, `:=`(day, lubridate::wday(event_date))]
maproducts[, `:=`(trend, 1:.N)]
maproducts[, `:=`(log_sold, log(sold_count))]
ggplot(maproducts, aes(x = event_date, y = log_sold)) + geom_line()
Variance of log of sold products seems more stable.Therefore,we start building linear regression models with logarithmic data.
lm_model <- lm(maproducts, formula = log_sold ~ month + day +
price + visit_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + day + price + visit_count, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.23155 -0.38023 -0.02241 0.31263 2.61907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.518e+00 3.982e-01 18.879 < 2e-16 ***
## month 6.895e-02 1.336e-02 5.160 4.06e-07 ***
## day -5.625e-03 1.838e-02 -0.306 0.76
## price -3.533e-02 7.184e-03 -4.918 1.32e-06 ***
## visit_count 1.660e-05 2.339e-06 7.096 6.66e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7084 on 367 degrees of freedom
## Multiple R-squared: 0.1824, Adjusted R-squared: 0.1734
## F-statistic: 20.46 on 4 and 367 DF, p-value: 3.11e-15
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.24229 -0.37555 -0.02929 0.31191 2.63026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.491e+00 3.877e-01 19.320 < 2e-16 ***
## month 6.904e-02 1.334e-02 5.175 3.75e-07 ***
## price -3.525e-02 7.170e-03 -4.917 1.33e-06 ***
## visit_count 1.661e-05 2.336e-06 7.109 6.13e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7075 on 368 degrees of freedom
## Multiple R-squared: 0.1821, Adjusted R-squared: 0.1755
## F-statistic: 27.32 on 3 and 368 DF, p-value: 5.666e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.53430 -0.43260 -0.03911 0.31180 2.38794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.209e+01 6.651e-01 18.173 < 2e-16 ***
## month 5.710e-02 1.237e-02 4.616 5.42e-06 ***
## price -1.052e-01 1.079e-02 -9.746 < 2e-16 ***
## visit_count 6.942e-06 2.453e-06 2.830 0.00492 **
## trend -5.281e-03 6.447e-04 -8.191 4.36e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6514 on 367 degrees of freedom
## Multiple R-squared: 0.3085, Adjusted R-squared: 0.301
## F-statistic: 40.94 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.62935 -0.43413 -0.03755 0.32096 2.52811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.117e+01 6.648e-01 16.809 < 2e-16 ***
## month 5.724e-02 1.194e-02 4.794 2.38e-06 ***
## price -9.338e-02 1.065e-02 -8.767 < 2e-16 ***
## visit_count 3.305e-07 2.678e-06 0.123 0.902
## trend -4.101e-03 6.611e-04 -6.203 1.49e-09 ***
## favored_count 1.511e-04 2.860e-05 5.285 2.16e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6288 on 366 degrees of freedom
## Multiple R-squared: 0.3576, Adjusted R-squared: 0.3488
## F-statistic: 40.74 on 5 and 366 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.55356 -0.22639 0.04185 0.28827 0.96790
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.945e+00 5.009e-01 15.861 < 2e-16 ***
## month 1.648e-02 8.732e-03 1.887 0.05993 .
## price -4.214e-02 8.018e-03 -5.256 2.51e-07 ***
## visit_count -8.494e-07 1.900e-06 -0.447 0.65508
## trend -9.410e-04 4.972e-04 -1.892 0.05923 .
## favored_count 6.540e-05 2.077e-05 3.149 0.00177 **
## category_sold 2.748e-04 1.442e-05 19.052 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4458 on 365 degrees of freedom
## Multiple R-squared: 0.6779, Adjusted R-squared: 0.6726
## F-statistic: 128 on 6 and 365 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + favored_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + favored_count,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.46608 -0.33154 -0.05262 0.34463 2.75039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.650e+00 3.624e-01 21.112 < 2e-16 ***
## month 6.551e-02 1.246e-02 5.259 2.46e-07 ***
## price -4.060e-02 6.728e-03 -6.035 3.90e-09 ***
## visit_count 4.361e-06 2.727e-06 1.599 0.111
## favored_count 2.110e-04 2.826e-05 7.468 6.00e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6601 on 367 degrees of freedom
## Multiple R-squared: 0.29, Adjusted R-squared: 0.2823
## F-statistic: 37.48 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
trend + favored_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + trend + favored_count,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.63705 -0.43319 -0.03951 0.32334 2.52880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.119e+01 6.494e-01 17.235 < 2e-16 ***
## month 5.668e-02 1.104e-02 5.134 4.62e-07 ***
## price -9.355e-02 1.055e-02 -8.868 < 2e-16 ***
## trend -4.121e-03 6.405e-04 -6.434 3.89e-10 ***
## favored_count 1.528e-04 2.525e-05 6.051 3.56e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6279 on 367 degrees of freedom
## Multiple R-squared: 0.3576, Adjusted R-squared: 0.3505
## F-statistic: 51.06 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold + category_brand_sold +
as.factor(day))
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold + category_brand_sold + as.factor(day),
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.41717 -0.23441 0.00487 0.29603 0.92353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.550e+00 4.879e-01 15.475 < 2e-16 ***
## month 3.861e-02 9.035e-03 4.273 2.48e-05 ***
## price -5.041e-02 7.778e-03 -6.481 3.02e-10 ***
## visit_count 5.051e-07 1.831e-06 0.276 0.7828
## trend 7.059e-04 5.474e-04 1.290 0.1980
## favored_count -6.297e-05 2.857e-05 -2.204 0.0281 *
## category_sold 2.939e-04 1.433e-05 20.506 < 2e-16 ***
## category_brand_sold 1.104e-05 1.752e-06 6.304 8.53e-10 ***
## as.factor(day)2 5.223e-02 8.247e-02 0.633 0.5270
## as.factor(day)3 1.648e-01 8.284e-02 1.989 0.0475 *
## as.factor(day)4 1.036e-01 8.296e-02 1.248 0.2127
## as.factor(day)5 9.351e-02 8.314e-02 1.125 0.2615
## as.factor(day)6 5.406e-02 8.289e-02 0.652 0.5147
## as.factor(day)7 1.303e-02 8.250e-02 0.158 0.8746
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4242 on 358 degrees of freedom
## Multiple R-squared: 0.714, Adjusted R-squared: 0.7036
## F-statistic: 68.73 on 13 and 358 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold + category_brand_sold)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold + category_brand_sold, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.42445 -0.24290 0.00981 0.28835 0.95183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.581e+00 4.799e-01 15.796 < 2e-16 ***
## month 3.852e-02 9.013e-03 4.274 2.46e-05 ***
## price -4.991e-02 7.724e-03 -6.461 3.34e-10 ***
## visit_count 7.677e-07 1.825e-06 0.421 0.6742
## trend 7.495e-04 5.439e-04 1.378 0.1690
## favored_count -6.286e-05 2.838e-05 -2.214 0.0274 *
## category_sold 2.974e-04 1.418e-05 20.974 < 2e-16 ***
## category_brand_sold 1.094e-05 1.738e-06 6.292 9.00e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.424 on 364 degrees of freedom
## Multiple R-squared: 0.7095, Adjusted R-squared: 0.7039
## F-statistic: 127 on 7 and 364 DF, p-value: < 2.2e-16
After evaluation of different models parameters of which is included in data, we choose model formula of which is ‘log_sold ~ month + price + visit_count + trend + favored_count + category_sold + category_brand_sold’. Furhermore,lag variables could violate our model.For that reason, we add lag1 and lag2 variables to model and examine their effectiveness.
maproducts[, `:=`(lag_1, NA)]
maproducts$lag_1[2:372] <- maproducts$log_sold[1:371]
maproducts[, `:=`(lag_2, NA)]
maproducts$lag_2[3:372] <- maproducts$log_sold[1:370]
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold + lag_1)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold + lag_1, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78466 -0.17543 0.02087 0.18797 0.98322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.100e+00 4.621e-01 8.872 < 2e-16 ***
## month 7.673e-03 6.793e-03 1.130 0.25942
## price -2.240e-02 6.348e-03 -3.529 0.00047 ***
## visit_count 3.753e-07 1.476e-06 0.254 0.79941
## trend -5.040e-04 3.866e-04 -1.304 0.19309
## favored_count 1.433e-05 1.643e-05 0.872 0.38361
## category_sold 1.753e-04 1.368e-05 12.819 < 2e-16 ***
## lag_1 4.837e-01 3.277e-02 14.761 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3456 on 363 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8074, Adjusted R-squared: 0.8037
## F-statistic: 217.4 on 7 and 363 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + category_sold + lag_1)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## category_sold + lag_1, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.77848 -0.17684 0.02261 0.18454 0.98184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.111e+00 4.618e-01 8.903 < 2e-16 ***
## month 7.256e-03 6.774e-03 1.071 0.284761
## price -2.282e-02 6.327e-03 -3.607 0.000352 ***
## visit_count 9.547e-07 1.318e-06 0.725 0.469206
## trend -5.779e-04 3.771e-04 -1.533 0.126229
## category_sold 1.762e-04 1.363e-05 12.925 < 2e-16 ***
## lag_1 4.892e-01 3.213e-02 15.224 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3455 on 364 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.807, Adjusted R-squared: 0.8039
## F-statistic: 253.7 on 6 and 364 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
trend + category_sold + lag_1 + lag_2)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + trend + category_sold +
## lag_1 + lag_2, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78302 -0.17611 0.01115 0.18232 0.90548
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.255e+00 4.322e-01 9.844 < 2e-16 ***
## month 7.376e-03 5.836e-03 1.264 0.207059
## price -2.129e-02 6.003e-03 -3.546 0.000442 ***
## trend -6.655e-04 3.378e-04 -1.970 0.049589 *
## category_sold 1.809e-04 1.365e-05 13.248 < 2e-16 ***
## lag_1 6.085e-01 4.862e-02 12.516 < 2e-16 ***
## lag_2 -1.503e-01 4.040e-02 -3.720 0.000230 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.331 on 363 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.823, Adjusted R-squared: 0.8201
## F-statistic: 281.3 on 6 and 363 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
trend + category_sold + lag_1 + lag_2 + as.factor(day))
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + trend + category_sold +
## lag_1 + lag_2 + as.factor(day), data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78906 -0.16743 0.01298 0.18226 0.92104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.052e+00 4.420e-01 9.168 < 2e-16 ***
## month 7.346e-03 5.798e-03 1.267 0.205974
## price -1.985e-02 6.010e-03 -3.303 0.001053 **
## trend -6.125e-04 3.375e-04 -1.815 0.070395 .
## category_sold 1.743e-04 1.374e-05 12.689 < 2e-16 ***
## lag_1 6.255e-01 4.965e-02 12.598 < 2e-16 ***
## lag_2 -1.481e-01 4.155e-02 -3.564 0.000414 ***
## as.factor(day)2 -8.424e-02 6.466e-02 -1.303 0.193527
## as.factor(day)3 3.685e-02 6.487e-02 0.568 0.570368
## as.factor(day)4 -8.666e-03 6.673e-02 -0.130 0.896750
## as.factor(day)5 1.244e-01 6.586e-02 1.889 0.059664 .
## as.factor(day)6 7.586e-03 6.496e-02 0.117 0.907112
## as.factor(day)7 2.645e-03 6.449e-02 0.041 0.967309
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3288 on 357 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8282, Adjusted R-squared: 0.8225
## F-statistic: 143.4 on 12 and 357 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
trend + category_sold + lag_1 + lag_2)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + trend + category_sold +
## lag_1 + lag_2, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78302 -0.17611 0.01115 0.18232 0.90548
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.255e+00 4.322e-01 9.844 < 2e-16 ***
## month 7.376e-03 5.836e-03 1.264 0.207059
## price -2.129e-02 6.003e-03 -3.546 0.000442 ***
## trend -6.655e-04 3.378e-04 -1.970 0.049589 *
## category_sold 1.809e-04 1.365e-05 13.248 < 2e-16 ***
## lag_1 6.085e-01 4.862e-02 12.516 < 2e-16 ***
## lag_2 -1.503e-01 4.040e-02 -3.720 0.000230 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.331 on 363 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.823, Adjusted R-squared: 0.8201
## F-statistic: 281.3 on 6 and 363 DF, p-value: < 2.2e-16
That modification gives us better model with respect to r-squared value.Our new best model has formula that is ‘log_sold ~ month + price + trend + category_sold + lag_1 + lag_2’.
As another improving step, we could take month variable as factor variable.
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + visit_count + trend + favored_count + category_sold +
lag_1 + lag_2)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + visit_count +
## trend + favored_count + category_sold + lag_1 + lag_2, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78360 -0.14550 -0.00289 0.14253 1.08028
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.071e+00 5.701e-01 8.894 < 2e-16 ***
## as.factor(month)2 -1.359e-02 1.143e-01 -0.119 0.90545
## as.factor(month)3 3.768e-02 1.174e-01 0.321 0.74843
## as.factor(month)4 1.321e-01 1.217e-01 1.085 0.27859
## as.factor(month)5 1.118e-02 1.077e-01 0.104 0.91737
## as.factor(month)6 -2.378e-02 1.319e-01 -0.180 0.85707
## as.factor(month)7 -3.747e-01 1.216e-01 -3.082 0.00222 **
## as.factor(month)8 -3.626e-01 1.137e-01 -3.191 0.00155 **
## as.factor(month)9 -8.878e-02 1.060e-01 -0.838 0.40268
## as.factor(month)10 9.986e-02 1.042e-01 0.958 0.33874
## as.factor(month)11 -6.428e-02 1.081e-01 -0.595 0.55235
## as.factor(month)12 1.721e-01 8.356e-02 2.060 0.04017 *
## price -2.749e-02 9.373e-03 -2.933 0.00358 **
## visit_count 1.091e-06 2.300e-06 0.474 0.63555
## trend 9.213e-05 5.899e-04 0.156 0.87599
## favored_count -5.311e-06 2.455e-05 -0.216 0.82886
## category_sold 1.902e-04 1.395e-05 13.635 < 2e-16 ***
## lag_1 5.437e-01 4.717e-02 11.526 < 2e-16 ***
## lag_2 -1.777e-01 3.977e-02 -4.469 1.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3142 on 351 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8458, Adjusted R-squared: 0.8379
## F-statistic: 106.9 on 18 and 351 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + visit_count + trend + category_sold + lag_1 + lag_2)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + visit_count +
## trend + category_sold + lag_1 + lag_2, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78042 -0.14673 -0.00239 0.14477 1.08025
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.065e+00 5.687e-01 8.907 < 2e-16 ***
## as.factor(month)2 -2.796e-03 1.027e-01 -0.027 0.978298
## as.factor(month)3 4.539e-02 1.117e-01 0.406 0.684722
## as.factor(month)4 1.409e-01 1.146e-01 1.229 0.219821
## as.factor(month)5 2.125e-02 9.701e-02 0.219 0.826778
## as.factor(month)6 -1.411e-02 1.240e-01 -0.114 0.909446
## as.factor(month)7 -3.659e-01 1.143e-01 -3.200 0.001499 **
## as.factor(month)8 -3.533e-01 1.051e-01 -3.363 0.000855 ***
## as.factor(month)9 -7.815e-02 9.376e-02 -0.834 0.405104
## as.factor(month)10 1.122e-01 8.716e-02 1.287 0.198880
## as.factor(month)11 -5.082e-02 8.824e-02 -0.576 0.565023
## as.factor(month)12 1.681e-01 8.140e-02 2.065 0.039611 *
## price -2.740e-02 9.352e-03 -2.930 0.003608 **
## visit_count 7.825e-07 1.802e-06 0.434 0.664389
## trend 1.007e-04 5.878e-04 0.171 0.864061
## category_sold 1.898e-04 1.377e-05 13.778 < 2e-16 ***
## lag_1 5.431e-01 4.703e-02 11.548 < 2e-16 ***
## lag_2 -1.787e-01 3.945e-02 -4.530 8.08e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3138 on 352 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8457, Adjusted R-squared: 0.8383
## F-statistic: 113.5 on 17 and 352 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + trend + category_sold + lag_1 + lag_2)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + category_sold +
## lag_1 + lag_2, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78021 -0.14134 -0.00437 0.14636 1.07965
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.123e+00 5.522e-01 9.277 < 2e-16 ***
## as.factor(month)2 2.237e-02 8.472e-02 0.264 0.79188
## as.factor(month)3 7.785e-02 8.289e-02 0.939 0.34828
## as.factor(month)4 1.646e-01 1.007e-01 1.635 0.10289
## as.factor(month)5 3.557e-02 9.113e-02 0.390 0.69657
## as.factor(month)6 -1.186e-03 1.202e-01 -0.010 0.99213
## as.factor(month)7 -3.552e-01 1.116e-01 -3.184 0.00158 **
## as.factor(month)8 -3.447e-01 1.030e-01 -3.345 0.00091 ***
## as.factor(month)9 -7.363e-02 9.307e-02 -0.791 0.42943
## as.factor(month)10 1.119e-01 8.706e-02 1.285 0.19954
## as.factor(month)11 -5.567e-02 8.742e-02 -0.637 0.52467
## as.factor(month)12 1.670e-01 8.126e-02 2.055 0.04064 *
## price -2.856e-02 8.954e-03 -3.190 0.00155 **
## trend 1.207e-05 5.506e-04 0.022 0.98252
## category_sold 1.905e-04 1.364e-05 13.968 < 2e-16 ***
## lag_1 5.438e-01 4.695e-02 11.582 < 2e-16 ***
## lag_2 -1.775e-01 3.931e-02 -4.516 8.6e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3135 on 353 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8457, Adjusted R-squared: 0.8387
## F-statistic: 120.9 on 16 and 353 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + category_sold + lag_1 + lag_2)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + category_sold +
## lag_1 + lag_2, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.78012 -0.14141 -0.00457 0.14608 1.07969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.1311195 0.4000478 12.826 < 2e-16 ***
## as.factor(month)2 0.0218987 0.0818169 0.268 0.789121
## as.factor(month)3 0.0776407 0.0822366 0.944 0.345755
## as.factor(month)4 0.1649441 0.0992943 1.661 0.097566 .
## as.factor(month)5 0.0360292 0.0885224 0.407 0.684249
## as.factor(month)6 0.0006196 0.0874290 0.007 0.994350
## as.factor(month)7 -0.3539073 0.0931808 -3.798 0.000172 ***
## as.factor(month)8 -0.3436076 0.0896331 -3.833 0.000149 ***
## as.factor(month)9 -0.0729030 0.0868755 -0.839 0.401943
## as.factor(month)10 0.1122532 0.0853731 1.315 0.189410
## as.factor(month)11 -0.0555272 0.0870522 -0.638 0.523977
## as.factor(month)12 0.1672511 0.0801523 2.087 0.037633 *
## price -0.0287052 0.0061096 -4.698 3.76e-06 ***
## category_sold 0.0001905 0.0000129 14.762 < 2e-16 ***
## lag_1 0.5437774 0.0468828 11.599 < 2e-16 ***
## lag_2 -0.1774352 0.0390363 -4.545 7.54e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.313 on 354 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8457, Adjusted R-squared: 0.8391
## F-statistic: 129.3 on 15 and 354 DF, p-value: < 2.2e-16
That is quite developing iteration,too.From the first view of logarithm of sold products,we may predict outliers that breaks our models’ goodness.Let us put them into model.
maproducts[, `:=`(residuals, NA)]
maproducts$residuals[2:372] <- lm_model$residuals
## Warning in maproducts$residuals[2:372] <- lm_model$residuals: number of items to
## replace is not a multiple of replacement length
maproducts[!is.na(residuals), `:=`(quant5, quantile(residuals,
0.05))]
maproducts[!is.na(residuals), `:=`(quant95, quantile(residuals,
0.95))]
maproducts[, `:=`(outlier_small, as.numeric(residuals < quant5))]
maproducts[, `:=`(outlier_great, as.numeric(residuals > quant95))]
lm_model2 <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + category_sold + lag_1 + lag_2 + outlier_small + outlier_great)
summary(lm_model2)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + category_sold +
## lag_1 + lag_2 + outlier_small + outlier_great, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.75665 -0.14145 -0.00064 0.14662 0.87929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.118e+00 3.980e-01 12.861 < 2e-16 ***
## as.factor(month)2 4.164e-02 8.071e-02 0.516 0.606239
## as.factor(month)3 6.394e-02 8.119e-02 0.788 0.431461
## as.factor(month)4 1.748e-01 9.829e-02 1.778 0.076221 .
## as.factor(month)5 4.582e-02 8.796e-02 0.521 0.602790
## as.factor(month)6 1.601e-02 8.624e-02 0.186 0.852845
## as.factor(month)7 -3.603e-01 9.168e-02 -3.930 0.000102 ***
## as.factor(month)8 -3.596e-01 8.835e-02 -4.070 5.81e-05 ***
## as.factor(month)9 -7.967e-02 8.543e-02 -0.933 0.351684
## as.factor(month)10 1.353e-01 8.433e-02 1.605 0.109405
## as.factor(month)11 -6.754e-02 8.566e-02 -0.788 0.430964
## as.factor(month)12 1.859e-01 7.914e-02 2.349 0.019354 *
## price -2.798e-02 6.162e-03 -4.540 7.73e-06 ***
## category_sold 1.946e-04 1.277e-05 15.244 < 2e-16 ***
## lag_1 5.203e-01 4.660e-02 11.166 < 2e-16 ***
## lag_2 -1.618e-01 3.864e-02 -4.187 3.58e-05 ***
## outlier_small 2.189e-02 7.986e-02 0.274 0.784130
## outlier_great 2.842e-01 7.535e-02 3.772 0.000190 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3077 on 352 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8517, Adjusted R-squared: 0.8445
## F-statistic: 118.9 on 17 and 352 DF, p-value: < 2.2e-16
lm_model2_1 <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + category_sold + lag_1 + lag_2 + outlier_great)
summary(lm_model2)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + category_sold +
## lag_1 + lag_2 + outlier_small + outlier_great, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.75665 -0.14145 -0.00064 0.14662 0.87929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.118e+00 3.980e-01 12.861 < 2e-16 ***
## as.factor(month)2 4.164e-02 8.071e-02 0.516 0.606239
## as.factor(month)3 6.394e-02 8.119e-02 0.788 0.431461
## as.factor(month)4 1.748e-01 9.829e-02 1.778 0.076221 .
## as.factor(month)5 4.582e-02 8.796e-02 0.521 0.602790
## as.factor(month)6 1.601e-02 8.624e-02 0.186 0.852845
## as.factor(month)7 -3.603e-01 9.168e-02 -3.930 0.000102 ***
## as.factor(month)8 -3.596e-01 8.835e-02 -4.070 5.81e-05 ***
## as.factor(month)9 -7.967e-02 8.543e-02 -0.933 0.351684
## as.factor(month)10 1.353e-01 8.433e-02 1.605 0.109405
## as.factor(month)11 -6.754e-02 8.566e-02 -0.788 0.430964
## as.factor(month)12 1.859e-01 7.914e-02 2.349 0.019354 *
## price -2.798e-02 6.162e-03 -4.540 7.73e-06 ***
## category_sold 1.946e-04 1.277e-05 15.244 < 2e-16 ***
## lag_1 5.203e-01 4.660e-02 11.166 < 2e-16 ***
## lag_2 -1.618e-01 3.864e-02 -4.187 3.58e-05 ***
## outlier_small 2.189e-02 7.986e-02 0.274 0.784130
## outlier_great 2.842e-01 7.535e-02 3.772 0.000190 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3077 on 352 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8517, Adjusted R-squared: 0.8445
## F-statistic: 118.9 on 17 and 352 DF, p-value: < 2.2e-16
As we expected, outliers help our model to predict better. For another perspective AIC values of models.
require(ursa)
## Loading required package: ursa
AIC(lm_model)
## [1] 208.1339
AIC(lm_model2_1)
## [1] 195.5486
That proves claim of model with month as factor variable.
Now,It is time for us to come up with arima models.Firstly,we build time series with frequency=7.Plot that.
ts_sold <- ts(maproducts$log_sold, frequency = 7)
ts.plot(ts_sold)
Variance is stable enough thanks to taking logarithm of data.Then, decompose time series data with additive type. Plot decomposed data
sold_decomp <- decompose(ts_sold, type = "additive")
plot(sold_decomp)
Random part is kind of stationary except some outlier points. We shall observe autocorrelation and partial autocorrelation of random part.
random = sold_decomp$random
# random part looks really fine.
acf(sold_decomp$random, na.action = na.pass)
pacf(sold_decomp$random, na.action = na.pass)
As we observe from those graphs,we start building arima models with (p,d,q)=(2,0,0) and increase p value to 10 one by one.After this process,we find model with auto arima function. Pick the best model.
ar_model <- arima(random, order = c(2, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(2, 0, 0))
##
## Coefficients:
## ar1 ar2 intercept
## 0.5499 -0.3993 -0.0006
## s.e. 0.0485 0.0487 0.0189
##
## sigma^2 estimated as 0.09456: log likelihood = -87.97, aic = 183.94
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -3.906212e-05 0.3074988 0.2143303 108.6211 244.7187 0.8054923
## ACF1
## Training set -0.04720534
ar_model <- arima(random, order = c(3, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.5011 -0.3299 -0.1256 -0.0005
## s.e. 0.0523 0.0564 0.0526 0.0167
##
## sigma^2 estimated as 0.09309: log likelihood = -85.14, aic = 180.29
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0001426 0.3051121 0.2120258 94.60449 227.2879 0.7968318
## ACF1
## Training set -0.02241973
ar_model <- arima(random, order = c(4, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(4, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.4775 -0.3933 -0.0274 -0.1959 -0.0004
## s.e. 0.0517 0.0579 0.0578 0.0520 0.0138
##
## sigma^2 estimated as 0.08958: log likelihood = -78.18, aic = 168.37
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0003721817 0.2993006 0.2059158 64.52858 206.3382 0.7738692
## ACF1
## Training set -0.04478536
ar_model <- arima(random, order = c(5, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(5, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 intercept
## 0.4301 -0.3984 -0.1215 -0.0797 -0.2430 -0.0003
## s.e. 0.0511 0.0561 0.0595 0.0561 0.0513 0.0108
##
## sigma^2 estimated as 0.08435: log likelihood = -67.33, aic = 148.66
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0005495861 0.2904369 0.2020511 84.01556 210.5383 0.7593449
## ACF1
## Training set -0.0234773
ar_model <- arima(random, order = c(6, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(6, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 intercept
## 0.4051 -0.4071 -0.1336 -0.1210 -0.1977 -0.1049 -0.0003
## s.e. 0.0524 0.0560 0.0595 0.0595 0.0559 0.0526 0.0097
##
## sigma^2 estimated as 0.08343: log likelihood = -65.35, aic = 146.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0005821933 0.2888445 0.202161 62.75557 213.4222 0.7597579
## ACF1
## Training set -0.01297692
ar_model <- arima(random, order = c(7, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(7, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 intercept
## 0.3910 -0.4344 -0.1508 -0.1388 -0.2536 -0.0485 -0.1392 -0.0003
## s.e. 0.0521 0.0564 0.0593 0.0593 0.0592 0.0562 0.0523 0.0085
##
## sigma^2 estimated as 0.08182: log likelihood = -61.85, aic = 141.69
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0006207775 0.2860402 0.2004398 61.87008 213.06 0.7532892
## ACF1
## Training set -0.01228718
ar_model <- arima(random, order = c(8, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(8, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ar8
## 0.3770 -0.4387 -0.1760 -0.1530 -0.2680 -0.0911 -0.1002 -0.0995
## s.e. 0.0524 0.0562 0.0605 0.0595 0.0594 0.0603 0.0560 0.0525
## intercept
## -0.0003
## s.e. 0.0077
##
## sigma^2 estimated as 0.08101: log likelihood = -60.06, aic = 140.12
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0006786974 0.2846185 0.1999935 56.32226 211.7121 0.7516121
## ACF1
## Training set -0.006338126
ar_model9 <- arima(random, order = c(9, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(8, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ar8
## 0.3770 -0.4387 -0.1760 -0.1530 -0.2680 -0.0911 -0.1002 -0.0995
## s.e. 0.0524 0.0562 0.0605 0.0595 0.0594 0.0603 0.0560 0.0525
## intercept
## -0.0003
## s.e. 0.0077
##
## sigma^2 estimated as 0.08101: log likelihood = -60.06, aic = 140.12
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0006786974 0.2846185 0.1999935 56.32226 211.7121 0.7516121
## ACF1
## Training set -0.006338126
ar_model <- arima(random, order = c(10, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(10, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ar8
## 0.3618 -0.4558 -0.1944 -0.1844 -0.3076 -0.1222 -0.1516 -0.1139
## s.e. 0.0524 0.0562 0.0605 0.0609 0.0613 0.0613 0.0607 0.0606
## ar9 ar10 intercept
## -0.0409 -0.0997 -0.0002
## s.e. 0.0567 0.0535 0.0065
##
## sigma^2 estimated as 0.07972: log likelihood = -57.21, aic = 138.43
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0008592798 0.2823551 0.1980518 34.04849 224.5072 0.7443149
## ACF1
## Training set -0.0178424
auto.arima(random, seasonal = FALSE, trace = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,0,2) with non-zero mean : 91.79447
## ARIMA(0,0,0) with non-zero mean : 301.3164
## ARIMA(1,0,0) with non-zero mean : 244.3749
## ARIMA(0,0,1) with non-zero mean : 209.4558
## ARIMA(0,0,0) with zero mean : 299.2956
## ARIMA(1,0,2) with non-zero mean : Inf
## ARIMA(2,0,1) with non-zero mean : 92.18419
## ARIMA(3,0,2) with non-zero mean : Inf
## ARIMA(2,0,3) with non-zero mean : 75.22619
## ARIMA(1,0,3) with non-zero mean : Inf
## ARIMA(3,0,3) with non-zero mean : Inf
## ARIMA(2,0,4) with non-zero mean : 64.74229
## ARIMA(1,0,4) with non-zero mean : Inf
## ARIMA(3,0,4) with non-zero mean : Inf
## ARIMA(2,0,5) with non-zero mean : 66.84559
## ARIMA(1,0,5) with non-zero mean : Inf
## ARIMA(3,0,5) with non-zero mean : Inf
## ARIMA(2,0,4) with zero mean : 62.74181
## ARIMA(1,0,4) with zero mean : 108.0237
## ARIMA(2,0,3) with zero mean : 73.24936
## ARIMA(3,0,4) with zero mean : Inf
## ARIMA(2,0,5) with zero mean : 64.78566
## ARIMA(1,0,3) with zero mean : 107.0578
## ARIMA(1,0,5) with zero mean : 109.7789
## ARIMA(3,0,3) with zero mean : Inf
## ARIMA(3,0,5) with zero mean : Inf
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(2,0,4) with zero mean : Inf
## ARIMA(2,0,4) with non-zero mean : Inf
## ARIMA(2,0,5) with zero mean : Inf
## ARIMA(2,0,5) with non-zero mean : Inf
## ARIMA(2,0,3) with zero mean : Inf
## ARIMA(2,0,3) with non-zero mean : Inf
## ARIMA(2,0,2) with non-zero mean : Inf
## ARIMA(2,0,1) with non-zero mean : Inf
## ARIMA(1,0,3) with zero mean : Inf
## ARIMA(1,0,4) with zero mean : Inf
## ARIMA(1,0,5) with zero mean : Inf
## ARIMA(0,0,1) with non-zero mean : 209.7616
##
## Best model: ARIMA(0,0,1) with non-zero mean
## Series: random
## ARIMA(0,0,1) with non-zero mean
##
## Coefficients:
## ma1 mean
## 0.5216 -0.0007
## s.e. 0.0396 0.0254
##
## sigma^2 estimated as 0.1026: log likelihood=-101.85
## AIC=209.7 AICc=209.76 BIC=221.4
fitted_model <- arima(random, order = c(9, 0, 0))
Best model we have above is the model with (p,d,q)=(9,0,0).
To compare lm and arima models, we should compare them for test period which is from 2021-05-11 to 2021-05-29.
train_start = as.Date("2020-05-25")
test_start = as.Date("2021-05-11")
test_end = as.Date("2021-05-29")
test_dates = seq(test_start, test_end, by = "day")
# forecast with lm model
forecast_with_lr = function(fmla, data, forecast_data) {
fitted_lm = lm(as.formula(fmla), data)
forecasted = predict(fitted_lm, forecast_data)
return(list(forecast = as.numeric(forecasted), model = fitted_lm))
}
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "log_sold",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(9, 0, 0))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
We define our functions with respect to parameters we found above.Then,we forecast for test dates with our lm and arima models separately.
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = maproducts[event_date <= current_date]
forecast_data = maproducts[event_date == test_dates[i]]
# first lm models
fmla = "log_sold ~ as.factor(month) + price + category_sold +
lag_1 + lag_2 + outlier_great"
forecasted = forecast_with_lr(fmla, past_data, forecast_data)
forecast_data[, `:=`(lm_prediction, forecasted$forecast)]
# arima model with auto.arima
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"log_sold", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$log_sold"
## [1] "2021-05-12"
## [1] "input_series=data$log_sold"
## [1] "2021-05-13"
## [1] "input_series=data$log_sold"
## [1] "2021-05-14"
## [1] "input_series=data$log_sold"
## [1] "2021-05-15"
## [1] "input_series=data$log_sold"
## [1] "2021-05-16"
## [1] "input_series=data$log_sold"
## [1] "2021-05-17"
## [1] "input_series=data$log_sold"
## [1] "2021-05-18"
## [1] "input_series=data$log_sold"
## [1] "2021-05-19"
## [1] "input_series=data$log_sold"
## [1] "2021-05-20"
## [1] "input_series=data$log_sold"
## [1] "2021-05-21"
## [1] "input_series=data$log_sold"
## [1] "2021-05-22"
## [1] "input_series=data$log_sold"
## [1] "2021-05-23"
## [1] "input_series=data$log_sold"
## [1] "2021-05-24"
## [1] "input_series=data$log_sold"
## [1] "2021-05-25"
## [1] "input_series=data$log_sold"
## [1] "2021-05-26"
## [1] "input_series=data$log_sold"
## [1] "2021-05-27"
## [1] "input_series=data$log_sold"
## [1] "2021-05-28"
## [1] "input_series=data$log_sold"
## [1] "2021-05-29"
## [1] "input_series=data$log_sold"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "log_sold"),
c("lm_prediction", "arima_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date log_sold variable value
## 1: 2021-05-11 5.894403 lm_prediction 6.072349
## 2: 2021-05-12 6.257668 lm_prediction 6.219121
## 3: 2021-05-13 6.327937 lm_prediction 6.120971
## 4: 2021-05-14 6.238325 lm_prediction 6.264525
## 5: 2021-05-15 6.309918 lm_prediction 6.344856
## 6: 2021-05-16 6.272877 lm_prediction 6.087698
## 7: 2021-05-17 5.680173 lm_prediction 5.943813
## 8: 2021-05-18 5.707110 lm_prediction 5.858567
## 9: 2021-05-19 5.609472 lm_prediction 5.804912
## 10: 2021-05-20 5.541264 lm_prediction 5.701320
## 11: 2021-05-21 5.433722 lm_prediction 5.731810
## 12: 2021-05-22 5.564520 lm_prediction 5.811365
## 13: 2021-05-23 5.648974 lm_prediction 5.747404
## 14: 2021-05-24 5.480639 lm_prediction 5.650380
## 15: 2021-05-25 5.375278 lm_prediction 5.657188
## 16: 2021-05-26 5.463832 lm_prediction 5.727750
## 17: 2021-05-27 5.638355 lm_prediction 5.649980
## 18: 2021-05-28 5.480639 lm_prediction 5.783207
## 19: 2021-05-29 5.755742 lm_prediction 7.053687
## 20: 2021-05-11 5.894403 arima_prediction 6.582012
## 21: 2021-05-12 6.257668 arima_prediction 6.581654
## 22: 2021-05-13 6.327937 arima_prediction 6.582530
## 23: 2021-05-14 6.238325 arima_prediction 6.582511
## 24: 2021-05-15 6.309918 arima_prediction 6.581589
## 25: 2021-05-16 6.272877 arima_prediction 6.581623
## 26: 2021-05-17 5.680173 arima_prediction 6.581908
## 27: 2021-05-18 5.707110 arima_prediction 6.581376
## 28: 2021-05-19 5.609472 arima_prediction 6.581444
## 29: 2021-05-20 5.541264 arima_prediction 6.581291
## 30: 2021-05-21 5.433722 arima_prediction 6.580898
## 31: 2021-05-22 5.564520 arima_prediction 6.577849
## 32: 2021-05-23 5.648974 arima_prediction 6.577888
## 33: 2021-05-24 5.480639 arima_prediction 6.577618
## 34: 2021-05-25 5.375278 arima_prediction 6.577243
## 35: 2021-05-26 5.463832 arima_prediction 6.575795
## 36: 2021-05-27 5.638355 arima_prediction 6.575985
## 37: 2021-05-28 5.480639 arima_prediction 6.576003
## 38: 2021-05-29 5.755742 arima_prediction 6.574101
## event_date log_sold variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(log_sold, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE
## 1: lm_prediction 19 5.772676 0.3352264 0.05807123 -0.03236714 0.0408488
## 2: arima_prediction 19 5.772676 0.3352264 0.05807123 -0.13977345 0.1432785
## RMSE MAD MADP WMAPE
## 1: 0.3540892 0.2321811 0.0402207 0.0402207
## 2: 0.8697548 0.8068669 0.1397734 0.1397734
First view of data shows that lm prediction is better in terms of MAPE.However,let us see boxplot MAPE and FBias with respect to days of a week.
performance = melted_result[, accu(log_sold, value), by = list(event_date,
variable)]
performance[, `:=`(day_of_week, wday(event_date, label = T))]
ggplot(performance, aes(x = day_of_week, y = MAPE, fill = variable)) +
geom_boxplot()
ggplot(performance, aes(x = day_of_week, y = FBias, fill = variable)) +
geom_boxplot()
As we observe from table,MAPE values of lm prediction is less.They both tend to predict less.However,lm is better than arima in terms of FBias ,too.
So,we forecast with our lm model.Before that,we forecast our parameters by building arima model with auto arima function.
price_ts <- ts(maproducts$price, frequency = 7)
price_dec <- decompose(x = price_ts, type = "additive")
price_model = auto.arima(price_dec$random)
price_model_forecast <- predict(price_model, n.ahead = 15)$pred
seasonality = price_dec$seasonal[1:1]
last_trend_value <- tail(price_dec$trend[!is.na(price_dec$trend)],
1)
price_model_forecast = price_model_forecast + last_trend_value +
seasonality
model_cat <- auto.arima(maproducts$category_sold)
cat_fcast <- predict(model_cat, n.ahead = 15)$pred
model_lag_1 <- auto.arima(maproducts$lag_1)
lag1_fcast <- predict(model_lag_1, n.ahead = 15)$pred
lag_2_ts <- ts(maproducts$lag_2, frequency = 7)
lag_2_dec <- decompose(x = lag_2_ts, type = "additive")
lag_2_model = auto.arima(lag_2_dec$random)
lag_2_model_forecast <- predict(lag_2_model, n.ahead = 15)$pred
seasonality = lag_2_dec$seasonal[1:1]
last_trend_value <- tail(lag_2_dec$trend[!is.na(lag_2_dec$trend)],
1)
lag_2_model_forecast = lag_2_model_forecast + last_trend_value +
seasonality
outlier_ts <- ts(maproducts$outlier_great, frequency = 6)
outlier_dec <- decompose(x = outlier_ts, type = "additive")
outlier_model = auto.arima(outlier_dec$random)
outlier_model_forecast <- predict(outlier_model, n.ahead = 15)$pred
seasonality = outlier_dec$seasonal[1:1]
last_trend_value <- tail(outlier_dec$trend[!is.na(outlier_dec$trend)],
1)
outlier_model_forecast = outlier_model_forecast + last_trend_value +
seasonality
Now,we forecast our value with our forecasted parameters.
forecast_Log <- predict(lm_model2_1, data.frame(month = as.factor(6),
price = price_model_forecast[1], category_sold = cat_fcast[1],
lag_1 = lag1_fcast[1], lag_2 = lag_2_model_forecast[1], outlier_great = outlier_model_forecast[1]))
fcast <- exp(forecast_Log)
##Xiaomi Bluetooth Kulaklık
After filtering our whole data for Xiaomi Bluetooth Kulaklık,we plot graph for amount that has been sold.
maproducts <- data[data$product_content_id == 6676673]
require(ggplot2)
ggplot(maproducts, aes(x = event_date, y = sold_count)) + geom_line()
That plot has wide variance, which urges us to take logarithm of sale data.Moreover, we add day,month ,trend data for further process.Then,we plot log of sold products over time to see if we decrease variance or not.
maproducts[, `:=`(month, month(event_date))]
maproducts[, `:=`(day, lubridate::wday(event_date))]
head(maproducts, 10)
## event_date product_content_id price sold_count visit_count basket_count
## 1: 2021-05-31 6676673 120.6812 345 19578 1261
## 2: 2021-05-30 6676673 122.3905 342 16238 1214
## 3: 2021-05-29 6676673 123.7674 286 15222 1063
## 4: 2021-05-28 6676673 126.5967 258 13615 966
## 5: 2021-05-27 6676673 126.6857 268 12907 937
## 6: 2021-05-26 6676673 129.5155 233 13614 942
## 7: 2021-05-25 6676673 130.1068 280 15730 1160
## 8: 2021-05-24 6676673 122.8083 384 18266 1467
## 9: 2021-05-23 6676673 111.5975 562 26286 2069
## 10: 2021-05-22 6676673 114.6897 535 24943 1862
## favored_count category_sold category_visits category_basket
## 1: 1510 4944 306462 23418
## 2: 947 4442 298876 21479
## 3: 880 4437 286091 21416
## 4: 657 647 4604 270790
## 5: 638 703 4637 273490
## 6: 657 746 4842 276308
## 7: 810 726 4719 283543
## 8: 1032 809 5140 288733
## 9: 1560 982 5304 347131
## 10: 1342 979 5378 339929
## category_favored category_brand_sold ty_visits month day
## 1: 26597 786 125439876 5 2
## 2: 23809 696 131821083 5 1
## 3: 20727 653 129670029 5 7
## 4: 19955 19556 103514886 5 6
## 5: 20959 20972 107391579 5 5
## 6: 21197 18830 106195988 5 4
## 7: 21665 18239 107004119 5 3
## 8: 22203 20886 108235639 5 2
## 9: 25978 26571 134993625 5 1
## 10: 24575 22101 133292217 5 7
maproducts[, `:=`(trend, 1:.N)]
maproducts[, `:=`(log_sold, log(sold_count))]
ggplot(maproducts, aes(x = event_date, y = log_sold)) + geom_line()
Variance of log of sold products seems more stable .Therefore,we start building linear regression models with logarithmic data.
lm_model <- lm(maproducts, formula = log_sold ~ month + day +
price + visit_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + day + price + visit_count, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2728 -0.2746 0.0323 0.2611 1.3832
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.661e+00 2.943e-01 32.823 <2e-16 ***
## month 1.147e-02 8.258e-03 1.389 0.166
## day -4.220e-03 1.147e-02 -0.368 0.713
## price -2.836e-02 2.135e-03 -13.284 <2e-16 ***
## visit_count -1.371e-06 2.568e-06 -0.534 0.594
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4417 on 367 degrees of freedom
## Multiple R-squared: 0.3684, Adjusted R-squared: 0.3615
## F-statistic: 53.51 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.28501 -0.27663 0.03077 0.26458 1.38308
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.639e+00 2.877e-01 33.499 <2e-16 ***
## month 1.140e-02 8.246e-03 1.383 0.168
## price -2.832e-02 2.129e-03 -13.300 <2e-16 ***
## visit_count -1.374e-06 2.565e-06 -0.536 0.593
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4412 on 368 degrees of freedom
## Multiple R-squared: 0.3681, Adjusted R-squared: 0.363
## F-statistic: 71.46 on 3 and 368 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.04989 -0.26063 -0.01178 0.27438 1.50117
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.017e+00 2.769e-01 32.562 < 2e-16 ***
## month 3.444e-03 7.683e-03 0.448 0.654
## price -2.711e-02 1.973e-03 -13.744 < 2e-16 ***
## visit_count 1.311e-05 2.982e-06 4.398 1.43e-05 ***
## trend 2.216e-03 2.767e-04 8.007 1.57e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4076 on 367 degrees of freedom
## Multiple R-squared: 0.4621, Adjusted R-squared: 0.4562
## F-statistic: 78.82 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.90224 -0.24750 0.00356 0.21453 1.59387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.907e+00 2.558e-01 34.819 < 2e-16 ***
## month 7.292e-03 7.103e-03 1.027 0.305
## price -2.805e-02 1.823e-03 -15.385 < 2e-16 ***
## visit_count 3.372e-06 3.002e-06 1.123 0.262
## trend 2.809e-03 2.656e-04 10.578 < 2e-16 ***
## favored_count 2.457e-04 3.039e-05 8.086 9.14e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.376 on 366 degrees of freedom
## Multiple R-squared: 0.5436, Adjusted R-squared: 0.5374
## F-statistic: 87.19 on 5 and 366 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + favored_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + favored_count,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23099 -0.26339 0.01086 0.24428 1.42204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.674e+00 2.799e-01 34.563 < 2e-16 ***
## month 1.522e-02 8.060e-03 1.888 0.05975 .
## price -2.912e-02 2.078e-03 -14.017 < 2e-16 ***
## visit_count -1.007e-05 3.104e-06 -3.244 0.00129 **
## favored_count 1.568e-04 3.333e-05 4.706 3.59e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.429 on 367 degrees of freedom
## Multiple R-squared: 0.4041, Adjusted R-squared: 0.3976
## F-statistic: 62.21 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
trend + favored_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + trend + favored_count,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92261 -0.24026 0.00556 0.20907 1.59245
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.028e+00 2.321e-01 38.897 <2e-16 ***
## month 5.767e-03 6.974e-03 0.827 0.409
## price -2.860e-02 1.758e-03 -16.266 <2e-16 ***
## trend 2.683e-03 2.407e-04 11.147 <2e-16 ***
## favored_count 2.594e-04 2.785e-05 9.316 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3761 on 367 degrees of freedom
## Multiple R-squared: 0.542, Adjusted R-squared: 0.5371
## F-statistic: 108.6 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80049 -0.19063 0.02167 0.19808 0.90490
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.770e+00 2.384e-01 32.590 < 2e-16 ***
## month 6.100e-03 6.055e-03 1.008 0.314
## price -2.271e-02 1.619e-03 -14.025 < 2e-16 ***
## visit_count 1.791e-06 2.563e-06 0.699 0.485
## trend 3.141e-03 2.281e-04 13.770 < 2e-16 ***
## favored_count 2.071e-04 2.611e-05 7.932 2.66e-14 ***
## category_sold 4.516e-04 3.834e-05 11.780 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3205 on 365 degrees of freedom
## Multiple R-squared: 0.6693, Adjusted R-squared: 0.6639
## F-statistic: 123.1 on 6 and 365 DF, p-value: < 2.2e-16
After evaluation of different models parameters of which is included in data, we choose model formula of which is ‘maproducts,formula=log_sold~month+price+visit_count+trend+favored_count+category_sold’. Furhermore,lag variables could violate our model.For that reason, we add lag1 and lag2 variables to model and examine their effectiveness.
maproducts[, `:=`(lag_1, NA)]
maproducts$lag_1[2:372] <- maproducts$log_sold[1:371]
maproducts[, `:=`(lag_2, NA)]
maproducts$lag_2[3:372] <- maproducts$log_sold[1:370]
# View(maproducts)
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold + lag_1)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold + lag_1, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.41945 -0.13333 0.00662 0.16028 0.90177
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.280e+00 3.078e-01 13.904 < 2e-16 ***
## month 7.230e-03 4.687e-03 1.543 0.124
## price -1.341e-02 1.407e-03 -9.537 < 2e-16 ***
## visit_count 1.161e-06 1.984e-06 0.585 0.559
## trend 1.916e-03 1.978e-04 9.687 < 2e-16 ***
## favored_count 1.136e-04 2.115e-05 5.369 1.42e-07 ***
## category_sold 3.894e-04 3.564e-05 10.926 < 2e-16 ***
## lag_1 4.403e-01 3.277e-02 13.434 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.248 on 363 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8031, Adjusted R-squared: 0.7993
## F-statistic: 211.6 on 7 and 363 DF, p-value: < 2.2e-16
# 10.06
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
trend + category_sold + lag_1 + lag_2 + as.factor(day))
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + trend + category_sold +
## lag_1 + lag_2 + as.factor(day), data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.87893 -0.10671 0.01649 0.14189 0.75873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.091e+00 3.122e-01 13.105 <2e-16 ***
## month 3.777e-03 4.557e-03 0.829 0.4077
## price -1.247e-02 1.375e-03 -9.069 <2e-16 ***
## trend 1.347e-03 1.530e-04 8.807 <2e-16 ***
## category_sold 5.021e-04 4.103e-05 12.235 <2e-16 ***
## lag_1 4.678e-01 4.634e-02 10.095 <2e-16 ***
## lag_2 -7.137e-03 4.096e-02 -0.174 0.8618
## as.factor(day)2 6.832e-02 4.847e-02 1.410 0.1595
## as.factor(day)3 7.713e-02 4.868e-02 1.584 0.1140
## as.factor(day)4 6.508e-02 4.881e-02 1.333 0.1833
## as.factor(day)5 9.323e-02 4.924e-02 1.893 0.0591 .
## as.factor(day)6 8.733e-02 4.892e-02 1.785 0.0751 .
## as.factor(day)7 4.854e-02 4.898e-02 0.991 0.3223
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.247 on 357 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8079, Adjusted R-squared: 0.8014
## F-statistic: 125.1 on 12 and 357 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
trend + category_sold + lag_1 + lag_2)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + trend + category_sold +
## lag_1 + lag_2, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.94258 -0.11454 0.02477 0.14676 0.76518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.180e+00 3.023e-01 13.830 <2e-16 ***
## month 3.848e-03 4.548e-03 0.846 0.398
## price -1.248e-02 1.355e-03 -9.209 <2e-16 ***
## trend 1.372e-03 1.520e-04 9.023 <2e-16 ***
## category_sold 5.171e-04 4.021e-05 12.859 <2e-16 ***
## lag_1 4.697e-01 4.532e-02 10.364 <2e-16 ***
## lag_2 -1.634e-02 4.016e-02 -0.407 0.684
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2466 on 363 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8053, Adjusted R-squared: 0.8021
## F-statistic: 250.2 on 6 and 363 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold + lag_1 +
lag_2)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold + lag_1 + lag_2, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.79701 -0.12755 0.02016 0.15447 0.73915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.472e+00 3.012e-01 14.847 < 2e-16 ***
## month 7.660e-03 4.480e-03 1.710 0.0881 .
## price -1.359e-02 1.345e-03 -10.107 < 2e-16 ***
## visit_count 6.683e-07 1.892e-06 0.353 0.7241
## trend 1.973e-03 1.891e-04 10.434 < 2e-16 ***
## favored_count 1.078e-04 2.026e-05 5.319 1.83e-07 ***
## category_sold 4.975e-04 3.875e-05 12.839 < 2e-16 ***
## lag_1 4.310e-01 4.392e-02 9.812 < 2e-16 ***
## lag_2 -3.568e-02 3.862e-02 -0.924 0.3562
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2363 on 361 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8222, Adjusted R-squared: 0.8183
## F-statistic: 208.7 on 8 and 361 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold + lag_1 +
lag_2 + category_favored)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold + lag_1 + lag_2 + category_favored,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.45142 -0.13499 0.02015 0.16021 0.70440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.791e+00 3.096e-01 15.473 < 2e-16 ***
## month 4.512e-03 4.496e-03 1.004 0.316276
## price -1.558e-02 1.436e-03 -10.848 < 2e-16 ***
## visit_count 1.081e-06 1.866e-06 0.579 0.562858
## trend 2.321e-03 2.102e-04 11.043 < 2e-16 ***
## favored_count 1.028e-04 1.999e-05 5.143 4.44e-07 ***
## category_sold 4.141e-04 4.473e-05 9.257 < 2e-16 ***
## lag_1 4.056e-01 4.381e-02 9.258 < 2e-16 ***
## lag_2 -4.091e-02 3.804e-02 -1.076 0.282825
## category_favored 6.939e-06 1.947e-06 3.565 0.000413 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2325 on 360 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8283, Adjusted R-squared: 0.824
## F-statistic: 192.9 on 9 and 360 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + trend + favored_count + category_sold + lag_1 + lag_2 +
category_favored)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + lag_1 + lag_2 + category_favored, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.57855 -0.10165 0.01745 0.12739 0.45999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.427e+00 3.377e-01 19.033 < 2e-16 ***
## as.factor(month)2 -3.978e-02 5.577e-02 -0.713 0.476174
## as.factor(month)3 -4.196e-01 6.367e-02 -6.590 1.61e-10 ***
## as.factor(month)4 -6.162e-02 6.030e-02 -1.022 0.307532
## as.factor(month)5 6.037e-02 5.819e-02 1.037 0.300258
## as.factor(month)6 -6.674e-02 7.854e-02 -0.850 0.396061
## as.factor(month)7 -1.574e-01 7.971e-02 -1.974 0.049147 *
## as.factor(month)8 -1.890e-03 7.056e-02 -0.027 0.978642
## as.factor(month)9 -2.375e-01 6.819e-02 -3.483 0.000558 ***
## as.factor(month)10 -6.981e-02 6.753e-02 -1.034 0.301980
## as.factor(month)11 -1.906e-01 6.879e-02 -2.772 0.005876 **
## as.factor(month)12 2.509e-02 5.475e-02 0.458 0.647024
## price -2.143e-02 1.628e-03 -13.166 < 2e-16 ***
## trend 2.666e-03 2.762e-04 9.650 < 2e-16 ***
## favored_count 6.740e-05 2.070e-05 3.256 0.001238 **
## category_sold 4.090e-04 4.074e-05 10.041 < 2e-16 ***
## lag_1 2.958e-01 4.085e-02 7.240 2.85e-12 ***
## lag_2 -8.038e-02 3.479e-02 -2.310 0.021440 *
## category_favored 1.260e-05 2.030e-06 6.207 1.52e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.208 on 351 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.866, Adjusted R-squared: 0.8591
## F-statistic: 126 on 18 and 351 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + trend + favored_count + category_sold + lag_1 + lag_2 +
category_favored)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + lag_1 + lag_2 + category_favored, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.57855 -0.10165 0.01745 0.12739 0.45999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.427e+00 3.377e-01 19.033 < 2e-16 ***
## as.factor(month)2 -3.978e-02 5.577e-02 -0.713 0.476174
## as.factor(month)3 -4.196e-01 6.367e-02 -6.590 1.61e-10 ***
## as.factor(month)4 -6.162e-02 6.030e-02 -1.022 0.307532
## as.factor(month)5 6.037e-02 5.819e-02 1.037 0.300258
## as.factor(month)6 -6.674e-02 7.854e-02 -0.850 0.396061
## as.factor(month)7 -1.574e-01 7.971e-02 -1.974 0.049147 *
## as.factor(month)8 -1.890e-03 7.056e-02 -0.027 0.978642
## as.factor(month)9 -2.375e-01 6.819e-02 -3.483 0.000558 ***
## as.factor(month)10 -6.981e-02 6.753e-02 -1.034 0.301980
## as.factor(month)11 -1.906e-01 6.879e-02 -2.772 0.005876 **
## as.factor(month)12 2.509e-02 5.475e-02 0.458 0.647024
## price -2.143e-02 1.628e-03 -13.166 < 2e-16 ***
## trend 2.666e-03 2.762e-04 9.650 < 2e-16 ***
## favored_count 6.740e-05 2.070e-05 3.256 0.001238 **
## category_sold 4.090e-04 4.074e-05 10.041 < 2e-16 ***
## lag_1 2.958e-01 4.085e-02 7.240 2.85e-12 ***
## lag_2 -8.038e-02 3.479e-02 -2.310 0.021440 *
## category_favored 1.260e-05 2.030e-06 6.207 1.52e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.208 on 351 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.866, Adjusted R-squared: 0.8591
## F-statistic: 126 on 18 and 351 DF, p-value: < 2.2e-16
That modification gives us better model with respect to r-squared value.Our new best model has formula that is ‘log_sold ~ as.factor(month) + price + trend + favored_count + category_sold + lag_1 + lag_2 + category_favored’.
From the first view of logarithm of sold products,we may predict outliers that breaks our models’ goodness.Let us put them into model.
maproducts[, `:=`(residuals, NA)]
maproducts$residuals[2:372] <- lm_model$residuals
## Warning in maproducts$residuals[2:372] <- lm_model$residuals: number of items to
## replace is not a multiple of replacement length
maproducts[!is.na(residuals), `:=`(quant5, quantile(residuals,
0.05))]
maproducts[!is.na(residuals), `:=`(quant95, quantile(residuals,
0.95))]
maproducts[, `:=`(outlier_small, as.numeric(residuals < quant5))]
maproducts[, `:=`(outlier_great, as.numeric(residuals > quant95))]
lm_model2 <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + category_sold + lag_1 + lag_2 + outlier_small + outlier_great)
summary(lm_model2)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + category_sold +
## lag_1 + lag_2 + outlier_small + outlier_great, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10580 -0.10063 0.00898 0.12142 0.70804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.496e+00 3.672e-01 14.969 < 2e-16 ***
## as.factor(month)2 -9.901e-02 6.328e-02 -1.565 0.118562
## as.factor(month)3 -4.949e-01 6.918e-02 -7.154 4.91e-12 ***
## as.factor(month)4 -2.390e-01 6.608e-02 -3.617 0.000342 ***
## as.factor(month)5 -1.460e-01 6.171e-02 -2.367 0.018490 *
## as.factor(month)6 2.617e-01 6.317e-02 4.143 4.30e-05 ***
## as.factor(month)7 7.182e-02 6.589e-02 1.090 0.276475
## as.factor(month)8 1.703e-01 6.220e-02 2.738 0.006498 **
## as.factor(month)9 -4.765e-02 6.332e-02 -0.753 0.452212
## as.factor(month)10 8.910e-02 6.777e-02 1.315 0.189405
## as.factor(month)11 -1.008e-02 6.394e-02 -0.158 0.874822
## as.factor(month)12 1.143e-01 6.225e-02 1.835 0.067284 .
## price -1.698e-02 1.783e-03 -9.519 < 2e-16 ***
## category_sold 5.078e-04 3.987e-05 12.738 < 2e-16 ***
## lag_1 4.175e-01 4.503e-02 9.271 < 2e-16 ***
## lag_2 -3.025e-02 3.995e-02 -0.757 0.449330
## outlier_small -6.693e-03 5.862e-02 -0.114 0.909157
## outlier_great 8.211e-02 5.909e-02 1.390 0.165539
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2383 on 352 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8237, Adjusted R-squared: 0.8152
## F-statistic: 96.73 on 17 and 352 DF, p-value: < 2.2e-16
lm_model2 <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + category_sold + lag_1 + lag_2 + outlier_great)
summary(lm_model2)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + category_sold +
## lag_1 + lag_2 + outlier_great, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10468 -0.10017 0.00919 0.12154 0.70849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.495e+00 3.666e-01 14.992 < 2e-16 ***
## as.factor(month)2 -9.956e-02 6.300e-02 -1.580 0.114974
## as.factor(month)3 -4.953e-01 6.900e-02 -7.178 4.20e-12 ***
## as.factor(month)4 -2.396e-01 6.579e-02 -3.642 0.000311 ***
## as.factor(month)5 -1.466e-01 6.145e-02 -2.386 0.017576 *
## as.factor(month)6 2.614e-01 6.302e-02 4.148 4.21e-05 ***
## as.factor(month)7 7.168e-02 6.579e-02 1.090 0.276650
## as.factor(month)8 1.702e-01 6.211e-02 2.741 0.006440 **
## as.factor(month)9 -4.800e-02 6.315e-02 -0.760 0.447748
## as.factor(month)10 8.862e-02 6.754e-02 1.312 0.190332
## as.factor(month)11 -1.066e-02 6.364e-02 -0.168 0.867018
## as.factor(month)12 1.137e-01 6.198e-02 1.835 0.067399 .
## price -1.697e-02 1.779e-03 -9.536 < 2e-16 ***
## category_sold 5.077e-04 3.979e-05 12.758 < 2e-16 ***
## lag_1 4.179e-01 4.485e-02 9.316 < 2e-16 ***
## lag_2 -3.063e-02 3.976e-02 -0.770 0.441591
## outlier_great 8.251e-02 5.890e-02 1.401 0.162089
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.238 on 353 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8237, Adjusted R-squared: 0.8157
## F-statistic: 103.1 on 16 and 353 DF, p-value: < 2.2e-16
lm_model2 <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + trend + favored_count + category_sold + lag_1 + lag_2 +
category_favored + outlier_small + outlier_great)
summary(lm_model2)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + lag_1 + lag_2 + category_favored + outlier_small +
## outlier_great, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.55511 -0.10489 0.02323 0.12605 0.49855
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.620e+00 3.392e-01 19.512 < 2e-16 ***
## as.factor(month)2 -4.963e-02 5.540e-02 -0.896 0.370961
## as.factor(month)3 -4.435e-01 6.345e-02 -6.990 1.40e-11 ***
## as.factor(month)4 -7.622e-02 6.007e-02 -1.269 0.205321
## as.factor(month)5 4.010e-02 5.807e-02 0.691 0.490274
## as.factor(month)6 -8.651e-02 7.789e-02 -1.111 0.267442
## as.factor(month)7 -1.865e-01 7.932e-02 -2.352 0.019252 *
## as.factor(month)8 -1.804e-02 6.995e-02 -0.258 0.796578
## as.factor(month)9 -2.540e-01 6.761e-02 -3.756 0.000202 ***
## as.factor(month)10 -8.392e-02 6.693e-02 -1.254 0.210702
## as.factor(month)11 -1.962e-01 6.803e-02 -2.884 0.004173 **
## as.factor(month)12 3.032e-02 5.426e-02 0.559 0.576682
## price -2.252e-02 1.646e-03 -13.687 < 2e-16 ***
## trend 2.804e-03 2.765e-04 10.142 < 2e-16 ***
## favored_count 6.810e-05 2.051e-05 3.321 0.000991 ***
## category_sold 4.094e-04 4.027e-05 10.165 < 2e-16 ***
## lag_1 2.718e-01 4.114e-02 6.606 1.47e-10 ***
## lag_2 -6.826e-02 3.466e-02 -1.970 0.049673 *
## category_favored 1.291e-05 2.011e-06 6.418 4.52e-10 ***
## outlier_small -3.120e-02 5.080e-02 -0.614 0.539580
## outlier_great 1.597e-01 5.157e-02 3.096 0.002123 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2056 on 349 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8698, Adjusted R-squared: 0.8624
## F-statistic: 116.6 on 20 and 349 DF, p-value: < 2.2e-16
lm_model2 <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + trend + favored_count + category_sold + lag_1 + lag_2 +
category_favored + outlier_great)
summary(lm_model2)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + lag_1 + lag_2 + category_favored + outlier_great,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.55181 -0.10312 0.02275 0.12664 0.49794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.609e+00 3.385e-01 19.524 < 2e-16 ***
## as.factor(month)2 -5.195e-02 5.522e-02 -0.941 0.347438
## as.factor(month)3 -4.444e-01 6.338e-02 -7.012 1.22e-11 ***
## as.factor(month)4 -7.916e-02 5.982e-02 -1.323 0.186609
## as.factor(month)5 3.797e-02 5.792e-02 0.656 0.512502
## as.factor(month)6 -8.605e-02 7.781e-02 -1.106 0.269553
## as.factor(month)7 -1.853e-01 7.922e-02 -2.339 0.019912 *
## as.factor(month)8 -1.673e-02 6.985e-02 -0.239 0.810870
## as.factor(month)9 -2.538e-01 6.755e-02 -3.758 0.000201 ***
## as.factor(month)10 -8.449e-02 6.686e-02 -1.264 0.207196
## as.factor(month)11 -1.966e-01 6.796e-02 -2.893 0.004053 **
## as.factor(month)12 2.804e-02 5.408e-02 0.518 0.604496
## price -2.246e-02 1.641e-03 -13.688 < 2e-16 ***
## trend 2.797e-03 2.760e-04 10.134 < 2e-16 ***
## favored_count 6.891e-05 2.045e-05 3.370 0.000835 ***
## category_sold 4.091e-04 4.023e-05 10.168 < 2e-16 ***
## lag_1 2.739e-01 4.095e-02 6.690 8.86e-11 ***
## lag_2 -6.992e-02 3.452e-02 -2.025 0.043598 *
## category_favored 1.283e-05 2.006e-06 6.398 5.05e-10 ***
## outlier_great 1.613e-01 5.145e-02 3.136 0.001859 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2055 on 350 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8697, Adjusted R-squared: 0.8626
## F-statistic: 122.9 on 19 and 350 DF, p-value: < 2.2e-16
In this model,we can add week variable to model to examine its effectiveness in the model.
# weekly
maproducts[, `:=`(weeks, week(event_date))]
head(maproducts)
## event_date product_content_id price sold_count visit_count basket_count
## 1: 2021-05-31 6676673 120.6812 345 19578 1261
## 2: 2021-05-30 6676673 122.3905 342 16238 1214
## 3: 2021-05-29 6676673 123.7674 286 15222 1063
## 4: 2021-05-28 6676673 126.5967 258 13615 966
## 5: 2021-05-27 6676673 126.6857 268 12907 937
## 6: 2021-05-26 6676673 129.5155 233 13614 942
## favored_count category_sold category_visits category_basket category_favored
## 1: 1510 4944 306462 23418 26597
## 2: 947 4442 298876 21479 23809
## 3: 880 4437 286091 21416 20727
## 4: 657 647 4604 270790 19955
## 5: 638 703 4637 273490 20959
## 6: 657 746 4842 276308 21197
## category_brand_sold ty_visits month day trend log_sold lag_1 lag_2
## 1: 786 125439876 5 2 1 5.843544 NA NA
## 2: 696 131821083 5 1 2 5.834811 5.843544 NA
## 3: 653 129670029 5 7 3 5.655992 5.834811 5.843544
## 4: 19556 103514886 5 6 4 5.552960 5.655992 5.834811
## 5: 20972 107391579 5 5 5 5.590987 5.552960 5.655992
## 6: 18830 106195988 5 4 6 5.451038 5.590987 5.552960
## residuals quant5 quant95 outlier_small outlier_great weeks
## 1: NA NA NA NA NA 22
## 2: -1.578550704 -0.2772592 0.2830867 1 0 22
## 3: 0.003586055 -0.2772592 0.2830867 0 0 22
## 4: 0.022684073 -0.2772592 0.2830867 0 0 22
## 5: -0.100691835 -0.2772592 0.2830867 0 0 21
## 6: 0.129492494 -0.2772592 0.2830867 0 0 21
lm_model3_2 <- lm(maproducts, formula = log_sold ~ as.factor(month) +
price + trend + favored_count + category_sold + lag_1 + lag_2 +
category_favored + outlier_great + weeks)
summary(lm_model3_2)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + lag_1 + lag_2 + category_favored + outlier_great +
## weeks, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.51234 -0.10238 0.01298 0.12121 0.50255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.779e+00 3.494e-01 19.401 < 2e-16 ***
## as.factor(month)2 1.490e-02 6.564e-02 0.227 0.820555
## as.factor(month)3 -3.178e-01 9.260e-02 -3.432 0.000670 ***
## as.factor(month)4 1.273e-01 1.256e-01 1.014 0.311275
## as.factor(month)5 3.190e-01 1.611e-01 1.980 0.048466 *
## as.factor(month)6 2.550e-01 1.983e-01 1.286 0.199332
## as.factor(month)7 2.293e-01 2.355e-01 0.974 0.330940
## as.factor(month)8 4.673e-01 2.682e-01 1.742 0.082363 .
## as.factor(month)9 2.959e-01 3.018e-01 0.980 0.327556
## as.factor(month)10 5.355e-01 3.384e-01 1.582 0.114484
## as.factor(month)11 4.975e-01 3.776e-01 1.317 0.188540
## as.factor(month)12 7.966e-01 4.148e-01 1.920 0.055638 .
## price -2.275e-02 1.642e-03 -13.853 < 2e-16 ***
## trend 2.832e-03 2.757e-04 10.273 < 2e-16 ***
## favored_count 6.908e-05 2.037e-05 3.391 0.000777 ***
## category_sold 4.058e-04 4.013e-05 10.112 < 2e-16 ***
## lag_1 2.722e-01 4.081e-02 6.670 1.00e-10 ***
## lag_2 -8.312e-02 3.512e-02 -2.367 0.018482 *
## category_favored 1.293e-05 2.000e-06 6.468 3.36e-10 ***
## outlier_great 1.577e-01 5.131e-02 3.074 0.002276 **
## weeks -1.612e-02 8.625e-03 -1.869 0.062528 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2047 on 349 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.871, Adjusted R-squared: 0.8636
## F-statistic: 117.8 on 20 and 349 DF, p-value: < 2.2e-16
# pick model 3
According to r-squared value,week variable makes contribution to our model. For another perspective AIC values of models.
require(ursa)
AIC(lm_model)
## [1] -91.34382
AIC(lm_model2)
## [1] -99.59528
AIC(lm_model3_2)
## [1] -101.2783
That proves claim of model with month as factor variable.
Now,It is time for us to come up with arima models.Firstly,we build time series with frequency=7.Plot that.
# time series and arima
ts_sold <- ts(maproducts$log_sold, frequency = 7)
ts.plot(ts_sold)
Box.test(ts_sold, lag = 7, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: ts_sold
## X-squared = 700.49, df = 7, p-value < 2.2e-16
Variance is stable enough thanks to taking logarithm of data.Then, decompose time series data with additive type. Plot decomposed data
sold_decomp <- decompose(ts_sold, type = "additive")
plot(sold_decomp)
Random part is kind of stationary except some outlier points. We shall observe autocorrelation and partial autocorrelation of random part.
random = sold_decomp$random
# random part looks really fine.
acf(sold_decomp$random, na.action = na.pass)
pacf(sold_decomp$random, na.action = na.pass)
As we observe from those graphs,we start building arima models with (p,d,q)=(2,0,0) and increase p value to 10 one by one.After this process,we find model with auto arima function. Pick the best model.
ar_model <- arima(random, order = c(2, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(2, 0, 0))
##
## Coefficients:
## ar1 ar2 intercept
## 0.3298 -0.2968 -0.0009
## s.e. 0.0499 0.0498 0.0133
##
## sigma^2 estimated as 0.0601: log likelihood = -4.89, aic = 17.79
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -8.385561e-05 0.2451437 0.1740224 54.28193 145.6099 0.7532093
## ACF1
## Training set -0.09301317
ar_model <- arima(random, order = c(3, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.2346 -0.1924 -0.3165 -0.0005
## s.e. 0.0495 0.0499 0.0495 0.0096
##
## sigma^2 estimated as 0.05402: log likelihood = 14.45, aic = -18.9
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0004537202 0.2324248 0.1687795 62.61691 194.2566 0.7305164
## ACF1
## Training set -0.0294018
ar_model <- arima(random, order = c(4, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(4, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.2033 -0.2116 -0.2938 -0.098 -0.0004
## s.e. 0.0520 0.0508 0.0507 0.052 0.0087
##
## sigma^2 estimated as 0.0535: log likelihood = 16.21, aic = -20.43
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0005966523 0.2312937 0.1683734 83.18963 180.3084 0.7287591
## ACF1
## Training set -0.02297017
ar_model <- arima(random, order = c(5, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(5, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 intercept
## 0.1784 -0.2863 -0.3477 -0.0472 -0.2518 -0.0002
## s.e. 0.0506 0.0513 0.0502 0.0513 0.0505 0.0067
##
## sigma^2 estimated as 0.05005: log likelihood = 28.24, aic = -42.48
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0008350554 0.2237166 0.1612522 64.79473 189.4283 0.6979366
## ACF1
## Training set -0.04320653
ar_model <- arima(random, order = c(6, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(6, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 intercept
## 0.1335 -0.2945 -0.4094 -0.0979 -0.2206 -0.1781 -0.0002
## s.e. 0.0514 0.0505 0.0526 0.0526 0.0504 0.0515 0.0056
##
## sigma^2 estimated as 0.04844: log likelihood = 34.13, aic = -52.25
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0007877407 0.2200882 0.1584303 47.56604 183.0987 0.685723
## ACF1
## Training set -0.006392688
ar_model <- arima(random, order = c(7, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(7, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 intercept
## 0.1259 -0.3039 -0.4135 -0.1153 -0.2332 -0.1728 -0.0428 -0.0002
## s.e. 0.0522 0.0518 0.0528 0.0567 0.0527 0.0518 0.0523 0.0054
##
## sigma^2 estimated as 0.04835: log likelihood = 34.46, aic = -50.92
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0007593015 0.2198832 0.1577712 50.28049 177.2777 0.6828702
## ACF1
## Training set -0.003008649
ar_model9 <- arima(random, order = c(8, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(7, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 intercept
## 0.1259 -0.3039 -0.4135 -0.1153 -0.2332 -0.1728 -0.0428 -0.0002
## s.e. 0.0522 0.0518 0.0528 0.0567 0.0527 0.0518 0.0523 0.0054
##
## sigma^2 estimated as 0.04835: log likelihood = 34.46, aic = -50.92
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0007593015 0.2198832 0.1577712 50.28049 177.2777 0.6828702
## ACF1
## Training set -0.003008649
Best model we have above is the model with (p,d,q)=(8,0,0).Then,we use auto arima function to compare our model with the model auto arima functions finds.
auto.arima(random, seasonal = FALSE, trace = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,0,2) with non-zero mean : Inf
## ARIMA(0,0,0) with non-zero mean : 72.16319
## ARIMA(1,0,0) with non-zero mean : 50.53033
## ARIMA(0,0,1) with non-zero mean : 39.66487
## ARIMA(0,0,0) with zero mean : 70.14585
## ARIMA(1,0,1) with non-zero mean : 42.61354
## ARIMA(0,0,2) with non-zero mean : 41.70325
## ARIMA(1,0,2) with non-zero mean : Inf
## ARIMA(0,0,1) with zero mean : 37.63635
## ARIMA(1,0,1) with zero mean : 40.57179
## ARIMA(0,0,2) with zero mean : 39.6635
## ARIMA(1,0,0) with zero mean : 48.4987
## ARIMA(1,0,2) with zero mean : Inf
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(0,0,1) with zero mean : 37.73032
##
## Best model: ARIMA(0,0,1) with zero mean
## Series: random
## ARIMA(0,0,1) with zero mean
##
## Coefficients:
## ma1
## 0.3147
## s.e. 0.0456
##
## sigma^2 estimated as 0.06435: log likelihood=-16.85
## AIC=37.7 AICc=37.73 BIC=45.5
fitted_model <- arima(random, order = c(8, 0, 0))
AIC(fitted_model)
## [1] -52.42999
Our model is better off.
To compare lm and arima models, we should compare them for test period which is from 2021-05-11 to 2021-05-29.
train_start = as.Date("2020-05-25")
test_start = as.Date("2021-05-11")
test_end = as.Date("2021-05-29")
test_dates = seq(test_start, test_end, by = "day")
test_dates
## [1] "2021-05-11" "2021-05-12" "2021-05-13" "2021-05-14" "2021-05-15"
## [6] "2021-05-16" "2021-05-17" "2021-05-18" "2021-05-19" "2021-05-20"
## [11] "2021-05-21" "2021-05-22" "2021-05-23" "2021-05-24" "2021-05-25"
## [16] "2021-05-26" "2021-05-27" "2021-05-28" "2021-05-29"
# forecast with lm model
forecast_with_lr = function(fmla, data, forecast_data) {
fitted_lm = lm(as.formula(fmla), data)
forecasted = predict(fitted_lm, forecast_data)
return(list(forecast = as.numeric(forecasted), model = fitted_lm))
}
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "log_sold",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(8, 0, 0))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
We define our functions with respect to parameters we found above.Then,we forecast for test dates with our lm and arima models separately.
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = maproducts[event_date <= current_date]
forecast_data = maproducts[event_date == test_dates[i]]
# first lm models
fmla = "log_sold~as.factor(month)+price+trend+favored_count+category_sold+lag_1+lag_2+category_favored+outlier_great+weeks"
forecasted = forecast_with_lr(fmla, past_data, forecast_data)
forecast_data[, `:=`(lm_prediction, forecasted$forecast)]
# arima model with auto.arima
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"log_sold", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$log_sold"
## [1] "2021-05-12"
## [1] "input_series=data$log_sold"
## [1] "2021-05-13"
## [1] "input_series=data$log_sold"
## [1] "2021-05-14"
## [1] "input_series=data$log_sold"
## [1] "2021-05-15"
## [1] "input_series=data$log_sold"
## [1] "2021-05-16"
## [1] "input_series=data$log_sold"
## [1] "2021-05-17"
## [1] "input_series=data$log_sold"
## [1] "2021-05-18"
## [1] "input_series=data$log_sold"
## [1] "2021-05-19"
## [1] "input_series=data$log_sold"
## [1] "2021-05-20"
## [1] "input_series=data$log_sold"
## [1] "2021-05-21"
## [1] "input_series=data$log_sold"
## [1] "2021-05-22"
## [1] "input_series=data$log_sold"
## [1] "2021-05-23"
## [1] "input_series=data$log_sold"
## [1] "2021-05-24"
## [1] "input_series=data$log_sold"
## [1] "2021-05-25"
## [1] "input_series=data$log_sold"
## [1] "2021-05-26"
## [1] "input_series=data$log_sold"
## [1] "2021-05-27"
## [1] "input_series=data$log_sold"
## [1] "2021-05-28"
## [1] "input_series=data$log_sold"
## [1] "2021-05-29"
## [1] "input_series=data$log_sold"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "log_sold"),
c("lm_prediction", "arima_prediction"))
We turned the results into melted results(long form).
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(log_sold, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE
## 1: lm_prediction 19 5.858585 0.3934767 0.06716242 -0.03175431 0.06462302
## 2: arima_prediction 19 5.858585 0.3934767 0.06716242 -0.05680289 0.07000923
## RMSE MAD MADP WMAPE
## 1: 0.7764512 0.3621781 0.06182007 0.06182007
## 2: 0.5073968 0.3865690 0.06598334 0.06598334
First view of data shows that lm prediction is better in terms of MAPE.However,let us see boxplot MAPE and FBias with respect to days of a week.
performance = melted_result[, accu(log_sold, value), by = list(event_date,
variable)]
performance[, `:=`(day_of_week, wday(event_date, label = T))]
ggplot(performance, aes(x = day_of_week, y = MAPE, fill = variable)) +
geom_boxplot()
ggplot(performance, aes(x = day_of_week, y = FBias, fill = variable)) +
geom_boxplot()
As we observe from table,MAPE values of lm prediction is less.They both tend to predict less.However,lm is better than arima in terms of FBias ,too.
So,we forecast with our lm model.Before that,we forecast our parameters by building arima model with auto arima function.
price_ts <- ts(maproducts$price, frequency = 7)
price_dec <- decompose(x = price_ts, type = "additive")
price_model = auto.arima(price_dec$random)
AIC(price_model)
## [1] 1966.841
price_model_forecast <- predict(price_model, n.ahead = 15)$pred
seasonality = price_dec$seasonal[1:1]
last_trend_value <- tail(price_dec$trend[!is.na(price_dec$trend)],
1)
price_model_forecast = price_model_forecast + last_trend_value +
seasonality
price_model_forecast
## Time Series:
## Start = c(54, 2)
## End = c(56, 2)
## Frequency = 7
## [1] 136.5906 137.0957 137.3132 137.3432 137.0187 136.9371 136.9831 136.9288
## [9] 137.0708 137.1368 137.1460 137.0447 137.0193 137.0337 137.0167
model_fav <- auto.arima(maproducts$favored_count)
fav_fcast <- predict(model_fav, n.ahead = 15)$pred
model_lag_1 <- auto.arima(maproducts$lag_1)
lag1_fcast <- predict(model_lag_1, n.ahead = 15)$pred
model_lag_2 <- auto.arima(maproducts$lag_2)
lag2_fcast <- predict(model_lag_2, n.ahead = 15)$pred
model_cat <- auto.arima(maproducts$category_sold)
cat_ffcast <- predict(model_cat, n.ahead = 15)$pred
cat_fav_ts <- ts(maproducts$category_favored, frequency = 7)
cat_fav_dec <- decompose(x = cat_fav_ts, type = "additive")
cat_fav_model = auto.arima(cat_fav_dec$random)
AIC(cat_fav_model)
## [1] 6957.51
cat_fav_model_forecast <- predict(cat_fav_model, n.ahead = 15)$pred
brand_ts <- ts(maproducts$category_brand_sold, frequency = 7)
brand_dec <- decompose(x = brand_ts, type = "additive")
brand_model = auto.arima(brand_dec$random)
AIC(brand_model)
## [1] 6921.524
brand_model_forecast <- predict(brand_model, n.ahead = 15)$pred
outlier_ts <- ts(maproducts$outlier_great, frequency = 7)
outlier_dec <- decompose(x = outlier_ts, type = "additive")
outlier_model = auto.arima(outlier_dec$random)
AIC(outlier_model)
## [1] -292.537
outlier_model_forecast <- predict(outlier_model, n.ahead = 15)$pred
# outlier forecast0.005171854 trend 373 cat fav 24381.21 week
# 21
forecast_log <- predict(lm_model3_2, data.frame(month = as.factor(6),
price = price_model_forecast[1], trend = 373, favored_count = fav_fcast[1],
category_sold = cat_ffcast[1], category_brand_sold = brand_model_forecast[1],
lag_1 = lag1_fcast[1], lag_2 = lag2_fcast[1], category_favored = cat_fav_model_forecast[1],
outlier_great = outlier_model_forecast[1], weeks = 21))
Now,we forecast our value with our forecasted parameters.
fcast2 <- exp(forecast_log)
fcast2
## 1
## 475.5609
##Fakir Dik Süpürge
After filtering our whole data for Fakir Dik Süpürge,we plot graph for amount that has been sold.
maproducts <- data[data$product_content_id == 7061886]
head(maproducts, 5)
## event_date product_content_id price sold_count visit_count basket_count
## 1: 2021-05-31 7061886 294.4167 24 1485 81
## 2: 2021-05-30 7061886 299.0000 11 1362 62
## 3: 2021-05-29 7061886 297.5714 14 1340 65
## 4: 2021-05-28 7061886 297.8889 9 972 62
## 5: 2021-05-27 7061886 299.0000 8 965 50
## favored_count category_sold category_visits category_basket category_favored
## 1: 122 951 95645 4704 8886
## 2: 100 810 92899 3988 8050
## 3: 123 790 91368 4386 7491
## 4: 76 84 705 70347 2987
## 5: 71 101 767 72503 3399
## category_brand_sold ty_visits
## 1: 179 125439876
## 2: 119 131821083
## 3: 112 129670029
## 4: 6453 103514886
## 5: 6539 107391579
require(ggplot2)
ggplot(maproducts, aes(x = event_date, y = sold_count)) + geom_line()
That plot has wide variance, which urges us to take logarithm of sale data.Moreover, we add day,month ,trend data for further process.Then,we plot log of sold products over time to see if we decrease variance or not.
maproducts[, `:=`(month, month(event_date))]
maproducts[, `:=`(day, lubridate::wday(event_date))]
head(maproducts, 10)
## event_date product_content_id price sold_count visit_count basket_count
## 1: 2021-05-31 7061886 294.4167 24 1485 81
## 2: 2021-05-30 7061886 299.0000 11 1362 62
## 3: 2021-05-29 7061886 297.5714 14 1340 65
## 4: 2021-05-28 7061886 297.8889 9 972 62
## 5: 2021-05-27 7061886 299.0000 8 965 50
## 6: 2021-05-26 7061886 299.0000 12 1098 63
## 7: 2021-05-25 7061886 295.0038 16 1138 65
## 8: 2021-05-24 7061886 299.0000 7 1137 53
## 9: 2021-05-23 7061886 298.1765 17 1540 82
## 10: 2021-05-22 7061886 297.7273 11 1515 58
## favored_count category_sold category_visits category_basket
## 1: 122 951 95645 4704
## 2: 100 810 92899 3988
## 3: 123 790 91368 4386
## 4: 76 84 705 70347
## 5: 71 101 767 72503
## 6: 71 104 785 71162
## 7: 93 112 803 71048
## 8: 98 106 795 78411
## 9: 132 143 835 100040
## 10: 120 122 887 103373
## category_favored category_brand_sold ty_visits month day
## 1: 8886 179 125439876 5 2
## 2: 8050 119 131821083 5 1
## 3: 7491 112 129670029 5 7
## 4: 2987 6453 103514886 5 6
## 5: 3399 6539 107391579 5 5
## 6: 3540 5162 106195988 5 4
## 7: 3258 5456 107004119 5 3
## 8: 3408 6025 108235639 5 2
## 9: 4032 8904 134993625 5 1
## 10: 4124 8167 133292217 5 7
maproducts[, `:=`(trend, 1:.N)]
maproducts[, `:=`(log_sold, log(sold_count))]
ggplot(maproducts, aes(x = event_date, y = log_sold)) + geom_line()
Variance of log of sold products seems more stable.Therefore,we start building linear regression models with logarithmic data.
lm_model <- lm(maproducts, formula = log_sold ~ month + day +
price + visit_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + day + price + visit_count, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.21797 -0.28910 0.01365 0.25433 1.88324
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.282e+00 4.201e-01 14.953 < 2e-16 ***
## month 8.944e-02 9.893e-03 9.040 < 2e-16 ***
## day -2.357e-02 1.413e-02 -1.668 0.0963 .
## price -1.351e-02 1.549e-03 -8.721 < 2e-16 ***
## visit_count 2.608e-04 3.363e-05 7.755 8.78e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5444 on 367 degrees of freedom
## Multiple R-squared: 0.3744, Adjusted R-squared: 0.3676
## F-statistic: 54.91 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.28849 -0.29884 0.00666 0.26591 1.93000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.196e+00 4.180e-01 14.824 < 2e-16 ***
## month 8.979e-02 9.914e-03 9.057 < 2e-16 ***
## price -1.356e-02 1.552e-03 -8.733 < 2e-16 ***
## visit_count 2.637e-04 3.367e-05 7.831 5.21e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5458 on 368 degrees of freedom
## Multiple R-squared: 0.3696, Adjusted R-squared: 0.3645
## F-statistic: 71.93 on 3 and 368 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.16167 -0.29608 0.03255 0.29471 1.77022
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.236e+01 1.022e+00 12.087 < 2e-16 ***
## month 7.415e-02 9.696e-03 7.648 1.81e-13 ***
## price -3.259e-02 3.262e-03 -9.990 < 2e-16 ***
## visit_count 1.197e-04 3.877e-05 3.087 0.00218 **
## trend -5.312e-03 8.126e-04 -6.537 2.10e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5172 on 367 degrees of freedom
## Multiple R-squared: 0.4354, Adjusted R-squared: 0.4292
## F-statistic: 70.75 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.12389 -0.27978 0.01514 0.27842 1.83699
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.383e+01 1.057e+00 13.078 < 2e-16 ***
## month 6.497e-02 9.719e-03 6.684 8.68e-11 ***
## price -3.789e-02 3.423e-03 -11.070 < 2e-16 ***
## visit_count 3.179e-05 4.315e-05 0.737 0.462
## trend -5.849e-03 8.042e-04 -7.273 2.15e-12 ***
## favored_count 1.375e-03 3.230e-04 4.258 2.62e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5056 on 366 degrees of freedom
## Multiple R-squared: 0.462, Adjusted R-squared: 0.4547
## F-statistic: 62.87 on 5 and 366 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.96936 -0.18329 0.05771 0.28790 0.78442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.073e+01 9.076e-01 11.828 < 2e-16 ***
## month 4.209e-02 8.243e-03 5.106 5.31e-07 ***
## price -2.854e-02 2.926e-03 -9.753 < 2e-16 ***
## visit_count -3.011e-05 3.607e-05 -0.835 0.404
## trend -3.466e-03 6.912e-04 -5.015 8.31e-07 ***
## favored_count 1.528e-03 2.678e-04 5.706 2.39e-08 ***
## category_sold 2.145e-03 1.654e-04 12.971 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4188 on 365 degrees of freedom
## Multiple R-squared: 0.6318, Adjusted R-squared: 0.6257
## F-statistic: 104.4 on 6 and 365 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + favored_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + favored_count,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.27022 -0.28029 0.01254 0.24740 1.99072
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.815e+00 4.636e-01 14.700 < 2e-16 ***
## month 8.422e-02 9.991e-03 8.430 8e-16 ***
## price -1.603e-02 1.750e-03 -9.161 < 2e-16 ***
## visit_count 2.100e-04 3.795e-05 5.533 6e-08 ***
## favored_count 1.007e-03 3.408e-04 2.955 0.00333 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5401 on 367 degrees of freedom
## Multiple R-squared: 0.3843, Adjusted R-squared: 0.3776
## F-statistic: 57.27 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
trend + favored_count)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + trend + favored_count,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.11449 -0.28832 0.02006 0.27857 1.83358
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.2076655 0.9202488 15.439 < 2e-16 ***
## month 0.0618618 0.0087523 7.068 7.97e-12 ***
## price -0.0390015 0.0030733 -12.690 < 2e-16 ***
## trend -0.0061856 0.0006616 -9.349 < 2e-16 ***
## favored_count 0.0014892 0.0002835 5.253 2.54e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5052 on 367 degrees of freedom
## Multiple R-squared: 0.4612, Adjusted R-squared: 0.4554
## F-statistic: 78.55 on 4 and 367 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold + category_brand_sold)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold + category_brand_sold, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.96934 -0.20414 0.05615 0.29352 0.79521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.087e+01 9.038e-01 12.031 < 2e-16 ***
## month 4.544e-02 8.313e-03 5.466 8.55e-08 ***
## price -2.934e-02 2.928e-03 -10.022 < 2e-16 ***
## visit_count -5.821e-05 3.776e-05 -1.542 0.12402
## trend -3.373e-03 6.880e-04 -4.903 1.42e-06 ***
## favored_count 1.082e-03 3.263e-04 3.314 0.00101 **
## category_sold 2.114e-03 1.648e-04 12.829 < 2e-16 ***
## category_brand_sold 2.482e-05 1.049e-05 2.367 0.01847 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4162 on 364 degrees of freedom
## Multiple R-squared: 0.6374, Adjusted R-squared: 0.6304
## F-statistic: 91.39 on 7 and 364 DF, p-value: < 2.2e-16
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
visit_count + trend + favored_count + category_sold + category_brand_sold)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + visit_count + trend +
## favored_count + category_sold + category_brand_sold, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.96934 -0.20414 0.05615 0.29352 0.79521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.087e+01 9.038e-01 12.031 < 2e-16 ***
## month 4.544e-02 8.313e-03 5.466 8.55e-08 ***
## price -2.934e-02 2.928e-03 -10.022 < 2e-16 ***
## visit_count -5.821e-05 3.776e-05 -1.542 0.12402
## trend -3.373e-03 6.880e-04 -4.903 1.42e-06 ***
## favored_count 1.082e-03 3.263e-04 3.314 0.00101 **
## category_sold 2.114e-03 1.648e-04 12.829 < 2e-16 ***
## category_brand_sold 2.482e-05 1.049e-05 2.367 0.01847 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4162 on 364 degrees of freedom
## Multiple R-squared: 0.6374, Adjusted R-squared: 0.6304
## F-statistic: 91.39 on 7 and 364 DF, p-value: < 2.2e-16
# that is the best
lm_model <- lm(maproducts, formula = log_sold ~ month + price +
trend + favored_count + category_sold + category_brand_sold)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ month + price + trend + favored_count +
## category_sold + category_brand_sold, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98687 -0.19292 0.05216 0.28444 0.82732
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.027e+01 8.165e-01 12.580 < 2e-16 ***
## month 5.013e-02 7.752e-03 6.466 3.23e-10 ***
## price -2.752e-02 2.684e-03 -10.255 < 2e-16 ***
## trend -2.882e-03 6.110e-04 -4.717 3.41e-06 ***
## favored_count 9.863e-04 3.210e-04 3.072 0.00228 **
## category_sold 2.089e-03 1.643e-04 12.714 < 2e-16 ***
## category_brand_sold 1.973e-05 9.973e-06 1.979 0.04859 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.417 on 365 degrees of freedom
## Multiple R-squared: 0.635, Adjusted R-squared: 0.629
## F-statistic: 105.8 on 6 and 365 DF, p-value: < 2.2e-16
After evaluation of different models parameters of which is included in data, we choose model formula of which is ‘log_sold~month+price+trend+favored_count+category_sold+category_brand_sold’.
Furhermore,lag variables could violate our model.For that reason, we add lag1 and lag2 variables to model and examine their effectiveness.
maproducts[, `:=`(lag_1, NA)]
maproducts$lag_1[2:372] <- maproducts$log_sold[1:371]
maproducts[, `:=`(lag_2, NA)]
maproducts$lag_2[3:372] <- maproducts$log_sold[1:370]
lm_model <- lm(formula = log_sold ~ as.factor(month) + price +
trend + favored_count + category_sold + category_brand_sold,
data = maproducts)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + category_brand_sold, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47670 -0.15665 0.00544 0.19455 0.81009
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.264e+01 8.687e-01 14.547 < 2e-16 ***
## as.factor(month)2 -1.242e-01 8.085e-02 -1.536 0.125316
## as.factor(month)3 1.719e-01 8.479e-02 2.028 0.043350 *
## as.factor(month)4 -1.171e-01 9.691e-02 -1.209 0.227585
## as.factor(month)5 -4.716e-02 8.215e-02 -0.574 0.566226
## as.factor(month)6 -4.277e-01 1.260e-01 -3.394 0.000766 ***
## as.factor(month)7 -7.801e-01 1.126e-01 -6.927 2.03e-11 ***
## as.factor(month)8 -8.433e-01 1.136e-01 -7.425 8.46e-13 ***
## as.factor(month)9 -1.872e-01 1.147e-01 -1.633 0.103442
## as.factor(month)10 3.438e-01 1.045e-01 3.291 0.001099 **
## as.factor(month)11 4.577e-01 1.037e-01 4.414 1.35e-05 ***
## as.factor(month)12 1.787e-01 8.417e-02 2.123 0.034453 *
## price -3.586e-02 2.779e-03 -12.903 < 2e-16 ***
## trend -1.628e-03 6.307e-04 -2.581 0.010266 *
## favored_count 8.743e-04 2.725e-04 3.209 0.001454 **
## category_sold 1.818e-03 1.317e-04 13.811 < 2e-16 ***
## category_brand_sold 3.862e-05 7.700e-06 5.016 8.36e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3067 on 355 degrees of freedom
## Multiple R-squared: 0.808, Adjusted R-squared: 0.7993
## F-statistic: 93.37 on 16 and 355 DF, p-value: < 2.2e-16
lm_model <- lm(formula = log_sold ~ as.factor(month) + price +
trend + favored_count + category_sold + category_brand_sold +
lag_1, data = maproducts)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + category_brand_sold + lag_1, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.42419 -0.14600 0.00883 0.17404 0.73865
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.763e+00 9.480e-01 10.299 < 2e-16 ***
## as.factor(month)2 -9.232e-02 7.690e-02 -1.201 0.230742
## as.factor(month)3 1.329e-01 8.067e-02 1.648 0.100227
## as.factor(month)4 -1.076e-01 9.195e-02 -1.170 0.242845
## as.factor(month)5 -3.904e-02 7.803e-02 -0.500 0.617120
## as.factor(month)6 -3.353e-01 1.204e-01 -2.785 0.005641 **
## as.factor(month)7 -5.888e-01 1.113e-01 -5.292 2.13e-07 ***
## as.factor(month)8 -6.748e-01 1.113e-01 -6.065 3.40e-09 ***
## as.factor(month)9 -1.652e-01 1.089e-01 -1.518 0.130008
## as.factor(month)10 2.422e-01 1.004e-01 2.413 0.016352 *
## as.factor(month)11 3.022e-01 1.013e-01 2.983 0.003048 **
## as.factor(month)12 1.344e-01 8.017e-02 1.677 0.094435 .
## price -2.775e-02 2.941e-03 -9.436 < 2e-16 ***
## trend -1.332e-03 6.017e-04 -2.213 0.027555 *
## favored_count 7.305e-04 2.617e-04 2.791 0.005537 **
## category_sold 1.614e-03 1.412e-04 11.427 < 2e-16 ***
## category_brand_sold 2.612e-05 7.666e-06 3.407 0.000733 ***
## lag_1 2.306e-01 3.921e-02 5.881 9.47e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2909 on 353 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8281, Adjusted R-squared: 0.8198
## F-statistic: 100 on 17 and 353 DF, p-value: < 2.2e-16
lm_model <- lm(formula = log_sold ~ as.factor(month) + price +
trend + favored_count + category_sold + category_brand_sold +
lag_1 + lag_2, data = maproducts)
summary(lm_model)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + category_brand_sold + lag_1 + lag_2, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4022 -0.1343 0.0095 0.1755 0.7167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.411e+00 9.285e-01 10.135 < 2e-16 ***
## as.factor(month)2 -9.267e-02 7.502e-02 -1.235 0.2175
## as.factor(month)3 1.245e-01 7.881e-02 1.579 0.1152
## as.factor(month)4 -1.070e-01 8.994e-02 -1.190 0.2349
## as.factor(month)5 -1.745e-02 7.655e-02 -0.228 0.8198
## as.factor(month)6 -3.005e-01 1.180e-01 -2.547 0.0113 *
## as.factor(month)7 -5.614e-01 1.087e-01 -5.163 4.08e-07 ***
## as.factor(month)8 -6.576e-01 1.086e-01 -6.056 3.60e-09 ***
## as.factor(month)9 -1.625e-01 1.067e-01 -1.523 0.1288
## as.factor(month)10 2.231e-01 9.927e-02 2.247 0.0252 *
## as.factor(month)11 2.481e-01 1.012e-01 2.452 0.0147 *
## as.factor(month)12 1.304e-01 7.830e-02 1.666 0.0967 .
## price -2.642e-02 2.884e-03 -9.162 < 2e-16 ***
## trend -1.434e-03 5.887e-04 -2.436 0.0153 *
## favored_count 8.169e-04 2.568e-04 3.181 0.0016 **
## category_sold 1.794e-03 1.436e-04 12.488 < 2e-16 ***
## category_brand_sold 1.638e-05 7.789e-06 2.102 0.0362 *
## lag_1 2.120e-01 4.535e-02 4.673 4.23e-06 ***
## lag_2 2.640e-02 3.872e-02 0.682 0.4958
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2838 on 351 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8364, Adjusted R-squared: 0.8281
## F-statistic: 99.72 on 18 and 351 DF, p-value: < 2.2e-16
That modification gives us better model with respect to r-squared value.Our new best model has formula that is ‘log_sold ~ as.factor(month) + price + trend + favored_count + category_sold + category_brand_sold + lag_1 + lag_2’.
That is quite developing iteration,too.From the first view of logarithm of sold products,we may predict outliers that breaks our models’ goodness.Let us put them into model.
maproducts[, `:=`(residuals, NA)]
maproducts$residuals[2:372] <- lm_model$residuals
## Warning in maproducts$residuals[2:372] <- lm_model$residuals: number of items to
## replace is not a multiple of replacement length
maproducts[!is.na(residuals), `:=`(quant5, quantile(residuals,
0.05))]
maproducts[!is.na(residuals), `:=`(quant95, quantile(residuals,
0.95))]
maproducts[, `:=`(outlier_small, as.numeric(residuals < quant5))]
maproducts[, `:=`(outlier_great, as.numeric(residuals > quant95))]
lm_model2 <- lm(formula = log_sold ~ as.factor(month) + price +
trend + favored_count + category_sold + category_brand_sold +
lag_1 + lag_2 + outlier_small + outlier_great, data = maproducts)
summary(lm_model2)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + category_brand_sold + lag_1 + lag_2 + outlier_small +
## outlier_great, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24210 -0.14328 0.01365 0.17803 0.70603
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.203e+00 9.238e-01 9.963 < 2e-16 ***
## as.factor(month)2 -7.018e-02 7.481e-02 -0.938 0.34885
## as.factor(month)3 1.379e-01 7.846e-02 1.758 0.07968 .
## as.factor(month)4 -8.774e-02 8.952e-02 -0.980 0.32768
## as.factor(month)5 1.394e-03 7.634e-02 0.018 0.98544
## as.factor(month)6 -2.991e-01 1.170e-01 -2.556 0.01100 *
## as.factor(month)7 -5.438e-01 1.081e-01 -5.033 7.75e-07 ***
## as.factor(month)8 -6.237e-01 1.083e-01 -5.757 1.88e-08 ***
## as.factor(month)9 -1.371e-01 1.062e-01 -1.290 0.19774
## as.factor(month)10 2.311e-01 9.863e-02 2.343 0.01969 *
## as.factor(month)11 2.564e-01 1.006e-01 2.549 0.01123 *
## as.factor(month)12 1.436e-01 7.795e-02 1.843 0.06622 .
## price -2.576e-02 2.870e-03 -8.976 < 2e-16 ***
## trend -1.250e-03 5.873e-04 -2.128 0.03408 *
## favored_count 7.841e-04 2.550e-04 3.075 0.00227 **
## category_sold 1.810e-03 1.425e-04 12.696 < 2e-16 ***
## category_brand_sold 1.805e-05 7.762e-06 2.325 0.02064 *
## lag_1 2.095e-01 4.498e-02 4.658 4.54e-06 ***
## lag_2 2.644e-02 3.840e-02 0.689 0.49156
## outlier_small -1.946e-01 7.018e-02 -2.773 0.00584 **
## outlier_great -5.484e-02 6.806e-02 -0.806 0.42094
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2813 on 349 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8401, Adjusted R-squared: 0.831
## F-statistic: 91.7 on 20 and 349 DF, p-value: < 2.2e-16
lm_model2 <- lm(formula = log_sold ~ as.factor(month) + price +
trend + favored_count + category_sold + category_brand_sold +
lag_1 + lag_2 + outlier_small, data = maproducts)
summary(lm_model2)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + category_brand_sold + lag_1 + lag_2 + outlier_small,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24046 -0.14048 0.01292 0.17476 0.71152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.235e+00 9.225e-01 10.012 < 2e-16 ***
## as.factor(month)2 -7.136e-02 7.476e-02 -0.955 0.34043
## as.factor(month)3 1.408e-01 7.833e-02 1.798 0.07308 .
## as.factor(month)4 -8.630e-02 8.946e-02 -0.965 0.33536
## as.factor(month)5 3.586e-03 7.625e-02 0.047 0.96252
## as.factor(month)6 -2.968e-01 1.169e-01 -2.539 0.01156 *
## as.factor(month)7 -5.479e-01 1.079e-01 -5.079 6.18e-07 ***
## as.factor(month)8 -6.266e-01 1.082e-01 -5.790 1.57e-08 ***
## as.factor(month)9 -1.382e-01 1.062e-01 -1.302 0.19378
## as.factor(month)10 2.348e-01 9.847e-02 2.384 0.01764 *
## as.factor(month)11 2.606e-01 1.004e-01 2.595 0.00985 **
## as.factor(month)12 1.465e-01 7.782e-02 1.883 0.06051 .
## price -2.589e-02 2.864e-03 -9.039 < 2e-16 ***
## trend -1.264e-03 5.868e-04 -2.154 0.03193 *
## favored_count 7.809e-04 2.548e-04 3.065 0.00235 **
## category_sold 1.808e-03 1.425e-04 12.693 < 2e-16 ***
## category_brand_sold 1.830e-05 7.752e-06 2.361 0.01876 *
## lag_1 2.102e-01 4.495e-02 4.676 4.19e-06 ***
## lag_2 2.580e-02 3.837e-02 0.672 0.50175
## outlier_small -1.902e-01 6.993e-02 -2.720 0.00686 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2812 on 350 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8398, Adjusted R-squared: 0.8311
## F-statistic: 96.58 on 19 and 350 DF, p-value: < 2.2e-16
As we expected, outliers help our model to predict better. In this model,we can add week variable to model to examine its effectiveness in the model.
maproducts[, `:=`(weeks, week(event_date))]
head(maproducts)
## event_date product_content_id price sold_count visit_count basket_count
## 1: 2021-05-31 7061886 294.4167 24 1485 81
## 2: 2021-05-30 7061886 299.0000 11 1362 62
## 3: 2021-05-29 7061886 297.5714 14 1340 65
## 4: 2021-05-28 7061886 297.8889 9 972 62
## 5: 2021-05-27 7061886 299.0000 8 965 50
## 6: 2021-05-26 7061886 299.0000 12 1098 63
## favored_count category_sold category_visits category_basket category_favored
## 1: 122 951 95645 4704 8886
## 2: 100 810 92899 3988 8050
## 3: 123 790 91368 4386 7491
## 4: 76 84 705 70347 2987
## 5: 71 101 767 72503 3399
## 6: 71 104 785 71162 3540
## category_brand_sold ty_visits month day trend log_sold lag_1 lag_2
## 1: 179 125439876 5 2 1 3.178054 NA NA
## 2: 119 131821083 5 1 2 2.397895 3.178054 NA
## 3: 112 129670029 5 7 3 2.639057 2.397895 3.178054
## 4: 6453 103514886 5 6 4 2.197225 2.639057 2.397895
## 5: 6539 107391579 5 5 5 2.079442 2.197225 2.639057
## 6: 5162 106195988 5 4 6 2.484907 2.079442 2.197225
## residuals quant5 quant95 outlier_small outlier_great weeks
## 1: NA NA NA NA NA 22
## 2: -0.9995279 -0.4841688 0.3882731 1 0 22
## 3: -0.2609915 -0.4841688 0.3882731 0 0 22
## 4: -0.2885245 -0.4841688 0.3882731 0 0 22
## 5: 0.1721715 -0.4841688 0.3882731 0 0 21
## 6: 0.2357392 -0.4841688 0.3882731 0 0 21
lm_model3 <- lm(formula = log_sold ~ as.factor(month) + price +
trend + favored_count + category_sold + category_brand_sold +
lag_1 + lag_2 + outlier_small + weeks, data = maproducts)
summary(lm_model3)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + category_brand_sold + lag_1 + lag_2 + outlier_small +
## weeks, data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24886 -0.14436 0.00757 0.17470 0.70989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.266e+00 9.295e-01 9.969 < 2e-16 ***
## as.factor(month)2 -5.729e-02 8.865e-02 -0.646 0.51853
## as.factor(month)3 1.697e-01 1.252e-01 1.355 0.17614
## as.factor(month)4 -4.210e-02 1.740e-01 -0.242 0.80900
## as.factor(month)5 6.359e-02 2.165e-01 0.294 0.76916
## as.factor(month)6 -2.230e-01 2.755e-01 -0.809 0.41884
## as.factor(month)7 -4.601e-01 3.157e-01 -1.457 0.14596
## as.factor(month)8 -5.237e-01 3.640e-01 -1.439 0.15109
## as.factor(month)9 -1.941e-02 4.150e-01 -0.047 0.96273
## as.factor(month)10 3.693e-01 4.646e-01 0.795 0.42725
## as.factor(month)11 4.102e-01 5.151e-01 0.796 0.42642
## as.factor(month)12 3.126e-01 5.660e-01 0.552 0.58110
## price -2.594e-02 2.872e-03 -9.030 < 2e-16 ***
## trend -1.275e-03 5.887e-04 -2.165 0.03105 *
## favored_count 7.793e-04 2.552e-04 3.053 0.00244 **
## category_sold 1.809e-03 1.427e-04 12.680 < 2e-16 ***
## category_brand_sold 1.811e-05 7.790e-06 2.324 0.02069 *
## lag_1 2.091e-01 4.514e-02 4.633 5.09e-06 ***
## lag_2 2.515e-02 3.849e-02 0.653 0.51391
## outlier_small -1.896e-01 7.005e-02 -2.707 0.00713 **
## weeks -3.450e-03 1.165e-02 -0.296 0.76728
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2816 on 349 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8399, Adjusted R-squared: 0.8307
## F-statistic: 91.52 on 20 and 349 DF, p-value: < 2.2e-16
# take that of
lm_model2_3 <- lm(formula = log_sold ~ as.factor(month) + price +
trend + favored_count + category_sold + category_brand_sold +
lag_1 + lag_2 + outlier_small, data = maproducts)
summary(lm_model2_3)
##
## Call:
## lm(formula = log_sold ~ as.factor(month) + price + trend + favored_count +
## category_sold + category_brand_sold + lag_1 + lag_2 + outlier_small,
## data = maproducts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24046 -0.14048 0.01292 0.17476 0.71152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.235e+00 9.225e-01 10.012 < 2e-16 ***
## as.factor(month)2 -7.136e-02 7.476e-02 -0.955 0.34043
## as.factor(month)3 1.408e-01 7.833e-02 1.798 0.07308 .
## as.factor(month)4 -8.630e-02 8.946e-02 -0.965 0.33536
## as.factor(month)5 3.586e-03 7.625e-02 0.047 0.96252
## as.factor(month)6 -2.968e-01 1.169e-01 -2.539 0.01156 *
## as.factor(month)7 -5.479e-01 1.079e-01 -5.079 6.18e-07 ***
## as.factor(month)8 -6.266e-01 1.082e-01 -5.790 1.57e-08 ***
## as.factor(month)9 -1.382e-01 1.062e-01 -1.302 0.19378
## as.factor(month)10 2.348e-01 9.847e-02 2.384 0.01764 *
## as.factor(month)11 2.606e-01 1.004e-01 2.595 0.00985 **
## as.factor(month)12 1.465e-01 7.782e-02 1.883 0.06051 .
## price -2.589e-02 2.864e-03 -9.039 < 2e-16 ***
## trend -1.264e-03 5.868e-04 -2.154 0.03193 *
## favored_count 7.809e-04 2.548e-04 3.065 0.00235 **
## category_sold 1.808e-03 1.425e-04 12.693 < 2e-16 ***
## category_brand_sold 1.830e-05 7.752e-06 2.361 0.01876 *
## lag_1 2.102e-01 4.495e-02 4.676 4.19e-06 ***
## lag_2 2.580e-02 3.837e-02 0.672 0.50175
## outlier_small -1.902e-01 6.993e-02 -2.720 0.00686 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2812 on 350 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.8398, Adjusted R-squared: 0.8311
## F-statistic: 96.58 on 19 and 350 DF, p-value: < 2.2e-16
For another perspective AIC values of models.
require(ursa)
AIC(lm_model)
## [1] 138.3645
AIC(lm_model2_3)
## [1] 132.6254
AIC(lm_model3)
## [1] 134.5325
That proves claim of model with month as factor variable.
Now,It is time for us to come up with arima models.Firstly,we build time series with frequency=7.Plot that.
# time series and arima
ts_sold <- ts(maproducts$log_sold, frequency = 7)
ts.plot(ts_sold)
Box.test(ts_sold, lag = 7, type = "Ljung-Box")
##
## Box-Ljung test
##
## data: ts_sold
## X-squared = 856.91, df = 7, p-value < 2.2e-16
Variance is stable enough thanks to taking logarithm of data.Then, decompose time series data with additive type. Plot decomposed data
sold_decomp <- decompose(ts_sold, type = "additive")
plot(sold_decomp)
Random part is kind of stationary except some outlier points. We shall observe autocorrelation and partial autocorrelation of random part.
random = sold_decomp$random
# random part looks really fine.
acf(sold_decomp$random, na.action = na.pass)
pacf(sold_decomp$random, na.action = na.pass)
As we observe from those graphs,we start building arima models with (p,d,q)=(2,0,0) and increase p value to 10 one by one.After this process,we find model with auto arima function. Pick the best model.
ar_model <- arima(random, order = c(2, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(2, 0, 0))
##
## Coefficients:
## ar1 ar2 intercept
## 0.1652 -0.1504 -0.0003
## s.e. 0.0517 0.0517 0.0153
##
## sigma^2 estimated as 0.08328: log likelihood = -64.51, aic = 137.02
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -9.192964e-05 0.2885835 0.2252879 140.0453 164.2215 0.7462189
## ACF1
## Training set -0.04263476
ar_model <- arima(random, order = c(3, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(3, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 intercept
## 0.1223 -0.1031 -0.2901 -0.0003
## s.e. 0.0500 0.0501 0.0503 0.0114
##
## sigma^2 estimated as 0.07628: log likelihood = -48.58, aic = 107.17
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0004172213 0.2761956 0.2117842 73.38254 196.0252 0.701491
## ACF1
## Training set -0.03808026
ar_model <- arima(random, order = c(4, 0, 0))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(4, 0, 0))
##
## Coefficients:
## ar1 ar2 ar3 ar4 intercept
## 0.0837 -0.1163 -0.2748 -0.1341 -4e-04
## s.e. 0.0517 0.0499 0.0502 0.0521 1e-02
##
## sigma^2 estimated as 0.07491: log likelihood = -45.3, aic = 102.6
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0004704353 0.2736996 0.2094488 67.30758 183.3832 0.6937554
## ACF1
## Training set -0.03024761
ar_model <- arima(random, order = c(0, 0, 1))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(0, 0, 1))
##
## Coefficients:
## ma1 intercept
## 0.1664 -0.0004
## s.e. 0.0510 0.0178
##
## sigma^2 estimated as 0.08482: log likelihood = -67.85, aic = 141.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 4.311051e-05 0.2912457 0.2276016 106.9205 132.293 0.7538828
## ACF1
## Training set -0.005386586
ar_model <- arima(random, order = c(0, 0, 2))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(0, 0, 2))
##
## Coefficients:
## ma1 ma2 intercept
## 0.1029 -0.0909 -0.0003
## s.e. 0.0871 0.0971 0.0154
##
## sigma^2 estimated as 0.08458: log likelihood = -67.33, aic = 142.66
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -4.257592e-05 0.2908307 0.2280883 129.2621 144.3999 0.7554947
## ACF1
## Training set 0.02991958
# THAT IS THE ONE
ar_model9 <- arima(random, order = c(0, 0, 3))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(0, 0, 2))
##
## Coefficients:
## ma1 ma2 intercept
## 0.1029 -0.0909 -0.0003
## s.e. 0.0871 0.0971 0.0154
##
## sigma^2 estimated as 0.08458: log likelihood = -67.33, aic = 142.66
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -4.257592e-05 0.2908307 0.2280883 129.2621 144.3999 0.7554947
## ACF1
## Training set 0.02991958
ar_model <- arima(random, order = c(0, 0, 4))
summary(ar_model)
##
## Call:
## arima(x = random, order = c(0, 0, 4))
##
## Coefficients:
## ma1 ma2 ma3 ma4 intercept
## -0.1232 -0.3761 -0.4221 -0.0786 1e-04
## s.e. 0.0513 0.0508 0.0451 0.0553 3e-04
##
## sigma^2 estimated as 0.06204: log likelihood = -13.09, aic = 38.18
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.000978949 0.2490877 0.1922267 84.52326 221.7878 0.6367108
## ACF1
## Training set 0.004184797
Best model we have above is the model with (p,d,q)=(0,0,3).
To compare lm and arima models, we should compare them for test period which is from 2021-05-11 to 2021-05-29.
train_start = as.Date("2020-05-25")
test_start = as.Date("2021-05-11")
test_end = as.Date("2021-05-29")
test_dates = seq(test_start, test_end, by = "day")
test_dates
## [1] "2021-05-11" "2021-05-12" "2021-05-13" "2021-05-14" "2021-05-15"
## [6] "2021-05-16" "2021-05-17" "2021-05-18" "2021-05-19" "2021-05-20"
## [11] "2021-05-21" "2021-05-22" "2021-05-23" "2021-05-24" "2021-05-25"
## [16] "2021-05-26" "2021-05-27" "2021-05-28" "2021-05-29"
# forecast with lm model
forecast_with_lr = function(fmla, data, forecast_data) {
fitted_lm = lm(as.formula(fmla), data)
forecasted = predict(fitted_lm, forecast_data)
return(list(forecast = as.numeric(forecasted), model = fitted_lm))
}
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "log_sold",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(9, 0, 0))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
We define our functions with respect to parameters we found above.Then,we forecast for test dates with our lm and arima models separately.
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = maproducts[event_date <= current_date]
forecast_data = maproducts[event_date == test_dates[i]]
# first lm models
fmla = "log_sold ~ as.factor(month) + price + category_sold +
lag_1 + lag_2 + outlier_great"
forecasted = forecast_with_lr(fmla, past_data, forecast_data)
forecast_data[, `:=`(lm_prediction, forecasted$forecast)]
# arima model with auto.arima
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"log_sold", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$log_sold"
## [1] "2021-05-12"
## [1] "input_series=data$log_sold"
## [1] "2021-05-13"
## [1] "input_series=data$log_sold"
## [1] "2021-05-14"
## [1] "input_series=data$log_sold"
## [1] "2021-05-15"
## [1] "input_series=data$log_sold"
## [1] "2021-05-16"
## [1] "input_series=data$log_sold"
## [1] "2021-05-17"
## [1] "input_series=data$log_sold"
## [1] "2021-05-18"
## [1] "input_series=data$log_sold"
## [1] "2021-05-19"
## [1] "input_series=data$log_sold"
## [1] "2021-05-20"
## [1] "input_series=data$log_sold"
## [1] "2021-05-21"
## [1] "input_series=data$log_sold"
## [1] "2021-05-22"
## [1] "input_series=data$log_sold"
## [1] "2021-05-23"
## [1] "input_series=data$log_sold"
## [1] "2021-05-24"
## [1] "input_series=data$log_sold"
## [1] "2021-05-25"
## [1] "input_series=data$log_sold"
## [1] "2021-05-26"
## [1] "input_series=data$log_sold"
## [1] "2021-05-27"
## [1] "input_series=data$log_sold"
## [1] "2021-05-28"
## [1] "input_series=data$log_sold"
## [1] "2021-05-29"
## [1] "input_series=data$log_sold"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "log_sold"),
c("lm_prediction", "arima_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date log_sold variable value
## 1: 2021-05-11 3.433987 lm_prediction 3.026657
## 2: 2021-05-12 2.995732 lm_prediction 3.036444
## 3: 2021-05-13 2.772589 lm_prediction 3.112824
## 4: 2021-05-14 3.555348 lm_prediction 3.113620
## 5: 2021-05-15 3.135494 lm_prediction 3.040228
## 6: 2021-05-16 3.258097 lm_prediction 3.071004
## 7: 2021-05-17 3.091042 lm_prediction 2.953569
## 8: 2021-05-18 2.944439 lm_prediction 2.989225
## 9: 2021-05-19 2.995732 lm_prediction 2.900105
## 10: 2021-05-20 2.833213 lm_prediction 2.651231
## 11: 2021-05-21 2.484907 lm_prediction 2.707107
## 12: 2021-05-22 2.397895 lm_prediction 2.711366
## 13: 2021-05-23 2.833213 lm_prediction 2.556579
## 14: 2021-05-24 1.945910 lm_prediction 2.662905
## 15: 2021-05-25 2.772589 lm_prediction 2.607640
## 16: 2021-05-26 2.484907 lm_prediction 2.446789
## 17: 2021-05-27 2.079442 lm_prediction 2.497629
## 18: 2021-05-28 2.197225 lm_prediction 2.549871
## 19: 2021-05-29 2.639057 lm_prediction 4.101142
## 20: 2021-05-11 3.433987 arima_prediction 3.753423
## 21: 2021-05-12 2.995732 arima_prediction 3.752491
## 22: 2021-05-13 2.772589 arima_prediction 3.750498
## 23: 2021-05-14 3.555348 arima_prediction 3.749736
## 24: 2021-05-15 3.135494 arima_prediction 3.751519
## 25: 2021-05-16 3.258097 arima_prediction 3.748914
## 26: 2021-05-17 3.091042 arima_prediction 3.748967
## 27: 2021-05-18 2.944439 arima_prediction 3.748417
## 28: 2021-05-19 2.995732 arima_prediction 3.748331
## 29: 2021-05-20 2.833213 arima_prediction 3.747522
## 30: 2021-05-21 2.484907 arima_prediction 3.745967
## 31: 2021-05-22 2.397895 arima_prediction 3.742094
## 32: 2021-05-23 2.833213 arima_prediction 3.742548
## 33: 2021-05-24 1.945910 arima_prediction 3.743295
## 34: 2021-05-25 2.772589 arima_prediction 3.743557
## 35: 2021-05-26 2.484907 arima_prediction 3.743530
## 36: 2021-05-27 2.079442 arima_prediction 3.741925
## 37: 2021-05-28 2.197225 arima_prediction 3.743136
## 38: 2021-05-29 2.639057 arima_prediction 3.743903
## event_date log_sold variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(log_sold, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE
## 1: lm_prediction 19 2.781622 0.4392261 0.1579029 -0.03566864 0.1210269
## 2: arima_prediction 19 2.781622 0.4392261 0.1579029 -0.34699471 0.3816355
## RMSE MAD MADP WMAPE
## 1: 0.4465979 0.3125010 0.1123449 0.1123449
## 2: 1.0544824 0.9652081 0.3469947 0.3469947
First view of data shows that lm prediction is better in terms of MAPE.However,let us see boxplot MAPE and FBias with respect to days of a week.
performance = melted_result[, accu(log_sold, value), by = list(event_date,
variable)]
performance[, `:=`(day_of_week, wday(event_date, label = T))]
ggplot(performance, aes(x = day_of_week, y = MAPE, fill = variable)) +
geom_boxplot()
ggplot(performance, aes(x = day_of_week, y = FBias, fill = variable)) +
geom_boxplot()
As we observe from table,MAPE values of lm prediction is less.They both tend to predict less.However,lm is better than arima in terms of FBias ,too.
So,we forecast with our lm model.Before that,we forecast our parameters by building arima model with auto arima function.
price_ts <- ts(maproducts$price, frequency = 7)
price_dec <- decompose(x = price_ts, type = "additive")
price_model = auto.arima(price_dec$random)
AIC(price_model)
## [1] 1812.927
price_model_forecast <- predict(price_model, n.ahead = 15)$pred
seasonality = price_dec$seasonal[1:1]
last_trend_value <- tail(price_dec$trend[!is.na(price_dec$trend)],
1)
price_model_forecast = price_model_forecast + last_trend_value +
seasonality
price_model_forecast
## Time Series:
## Start = c(54, 2)
## End = c(56, 2)
## Frequency = 7
## [1] 235.8121 235.7070 235.8431 236.0378 235.9613 235.9003 235.9003 235.9003
## [9] 235.9003 235.9003 235.9003 235.9003 235.9003 235.9003 235.9003
model_cat <- auto.arima(maproducts$category_sold)
model_fav <- auto.arima(maproducts$favored_count)
fav_fcast <- predict(model_fav, n.ahead = 15)$pred
cat_fcast <- predict(model_cat, n.ahead = 15)$pred
model_lag_1 <- auto.arima(maproducts$lag_1)
lag1_fcast <- predict(model_lag_1, n.ahead = 15)$pred
model_lag_2 <- auto.arima(maproducts$lag_2)
lag2_fcast <- predict(model_lag_2, n.ahead = 15)$pred
model_brand <- auto.arima(maproducts$category_brand_sold)
brand_fcast <- predict(model_brand, n.ahead = 15)$pred
model_out <- auto.arima(maproducts$outlier_small)
small_fcast <- predict(model_out, n.ahead = 15)$pred
Now,we forecast our value with our forecasted parameters.
forecast_Log <- predict(lm_model2_3, data.frame(month = as.factor(6),
price = price_model_forecast[1], trend = 373, favored_count = fav_fcast[1],
category_sold = cat_fcast[1], category_brand_sold = brand_fcast[1],
lag_1 = lag1_fcast[1], lag_2 = lag2_fcast[1], outlier_small = small_fcast[1]))
fcast3 <- exp(forecast_Log)
fcast3
## 1
## 29.69554
data_path = "/Users/onurcanaydin/Desktop/360 proje/ProjectRawData.csv"
daily_data = fread(data_path)
str(daily_data)
## Classes 'data.table' and 'data.frame': 4331 obs. of 13 variables:
## $ event_date : IDate, format: "2021-05-31" "2021-05-31" ...
## $ product_content_id : int 85004 4066298 6676673 7061886 31515569 32737302 32939029 48740784 73318567 85004 ...
## $ price : num 87.4 65.8 120.7 294.4 60.3 ...
## $ sold_count : int 80 1398 345 24 454 44 125 2 175 85 ...
## $ visit_count : int 6002 19102 19578 1485 20114 5017 5301 221 20256 5265 ...
## $ basket_count : int 744 5096 1261 81 2871 316 608 15 965 633 ...
## $ favored_count : int 1642 1703 1510 122 2102 607 698 22 1960 1104 ...
## $ category_sold : int 5048 6547 4944 951 8155 5198 930 1684 5198 4117 ...
## $ category_visits : int 236197 108811 306462 95645 637143 1010634 41973 329087 1010634 215260 ...
## $ category_basket : int 33681 28558 23418 4704 49389 33728 3911 12614 33728 25181 ...
## $ category_favored : int 40472 11913 26597 8886 62460 96699 5791 24534 96699 36225 ...
## $ category_brand_sold: int 743 4286 786 179 1759 3665 875 12 3665 430 ...
## $ ty_visits : int 125439876 125439876 125439876 125439876 125439876 125439876 125439876 125439876 125439876 131821083 ...
## - attr(*, ".internal.selfref")=<externalptr>
When we look at the sales graph of product 1(‘mont’) , we see that there is no sales in some periods.At first,I thought the reason for this was that the store in trendyol had removed product from the aisle.However,then when I saw that the product was added to the basket and favorites during these non-sale periods, I realized that the reason for this was seasonality.
daily_data$event_date <- as.Date(daily_data$event_date, "%d.%m.%Y")
product1_data <- daily_data[product_content_id == 48740784]
ggplot(product1_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
If we see the sales period more closely:
ggplot(product1_data[as.Date(event_date) <= "2021-01-01" & as.Date(event_date) >=
"2020-10-01"], aes(x = event_date, y = sold_count)) + geom_line(colour = "firebrick2",
size = 1.5) + theme_dark()
We established both arima and linear regression models for the products.Then we compared these forecasting models and chose the best one.At first, we built a linear regression model using all predictors for product1.
product1_reg <- lm(sold_count ~ price + visit_count + basket_count +
favored_count + category_sold + category_visits + category_basket +
category_favored + category_brand_sold + ty_visits, product1_data)
summary(product1_reg)
##
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count +
## favored_count + category_sold + category_visits + category_basket +
## category_favored + category_brand_sold + ty_visits, data = product1_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5381 -1.3319 -0.4026 1.2313 7.7383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.320e+00 2.276e+00 2.337 0.0229 *
## price -3.457e-03 2.818e-03 -1.227 0.2250
## visit_count -1.698e-03 1.968e-02 -0.086 0.9315
## basket_count 1.990e-01 1.006e-02 19.787 < 2e-16 ***
## favored_count -5.923e-02 1.714e-01 -0.346 0.7309
## category_sold 1.232e-03 1.075e-03 1.146 0.2564
## category_visits -2.694e-06 1.126e-05 -0.239 0.8118
## category_basket 1.399e-06 1.506e-06 0.929 0.3568
## category_favored -6.610e-05 1.193e-05 -5.540 7.72e-07 ***
## category_brand_sold 9.788e-06 8.014e-06 1.221 0.2269
## ty_visits -2.134e-08 1.171e-08 -1.822 0.0736 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.611 on 58 degrees of freedom
## (303 observations deleted due to missingness)
## Multiple R-squared: 0.8873, Adjusted R-squared: 0.8679
## F-statistic: 45.66 on 10 and 58 DF, p-value: < 2.2e-16
We’ve seen that basket_count and category_favored are effective predictors.The adjusted R-squared value was also high.The p value of the f test is fine.
product1_reg <- lm(sold_count ~ price + basket_count + category_sold +
category_basket + category_favored + category_brand_sold +
ty_visits, product1_data)
summary(product1_reg)
##
## Call:
## lm(formula = sold_count ~ price + basket_count + category_sold +
## category_basket + category_favored + category_brand_sold +
## ty_visits, data = product1_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.4637 -1.3676 -0.4079 1.2025 7.6485
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.207e+00 2.213e+00 2.353 0.0219 *
## price -3.432e-03 2.745e-03 -1.250 0.2160
## basket_count 1.981e-01 9.856e-03 20.102 < 2e-16 ***
## category_sold 6.619e-04 6.786e-04 0.975 0.3332
## category_basket 9.669e-07 9.100e-07 1.062 0.2922
## category_favored -6.118e-05 1.041e-05 -5.878 1.87e-07 ***
## category_brand_sold 7.548e-06 6.904e-06 1.093 0.2786
## ty_visits -2.451e-08 1.021e-08 -2.401 0.0194 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.57 on 61 degrees of freedom
## (303 observations deleted due to missingness)
## Multiple R-squared: 0.8852, Adjusted R-squared: 0.8721
## F-statistic: 67.21 on 7 and 61 DF, p-value: < 2.2e-16
We further increased the adjusted R-squared value by removing redundant predictors. We found the forecast values in the train period with the linear model we built and we printed the forecast values over the actual values on the chart.
product1_data[, `:=`(forecast1, predict(product1_reg, product1_data))]
ggplot(product1_data, aes(x = event_date)) + geom_line(aes(y = sold_count,
color = "real"), size = 1.5) + geom_line(aes(y = forecast1,
color = "predicted"), size = 1.5) + theme_dark()
## Warning: Removed 127 row(s) containing missing values (geom_path).
We added a daytype column to the product 1 data table to find out if there is daily seasonality in the sales data.The daytype variable returns from 1 to 7 for days.We also added daytype to the linear model.
day_type <- read_excel("/Users/onurcanaydin/Downloads/daytype.xlsx",
range = "C1:C372", col_names = FALSE)
## New names:
## * `` -> ...1
product1_data <- cbind(product1_data, day_type)
names(product1_data)[15] <- "day_type"
product1_reg <- lm(sold_count ~ price + basket_count + category_sold +
category_basket + category_favored + category_brand_sold +
ty_visits + as.factor(day_type), product1_data)
summary(product1_reg)
##
## Call:
## lm(formula = sold_count ~ price + basket_count + category_sold +
## category_basket + category_favored + category_brand_sold +
## ty_visits + as.factor(day_type), data = product1_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9503 -1.0827 -0.3769 1.2563 6.7710
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.247e+00 2.317e+00 2.265 0.0275 *
## price -2.925e-03 2.823e-03 -1.036 0.3046
## basket_count 1.964e-01 1.019e-02 19.277 < 2e-16 ***
## category_sold 5.971e-04 6.931e-04 0.862 0.3927
## category_basket 1.092e-06 9.473e-07 1.153 0.2540
## category_favored -5.925e-05 1.063e-05 -5.572 7.83e-07 ***
## category_brand_sold 8.041e-06 7.055e-06 1.140 0.2593
## ty_visits -2.212e-08 1.040e-08 -2.126 0.0380 *
## as.factor(day_type)2 -1.768e+00 1.236e+00 -1.430 0.1584
## as.factor(day_type)3 -1.196e+00 1.290e+00 -0.927 0.3578
## as.factor(day_type)4 -7.771e-01 1.157e+00 -0.672 0.5045
## as.factor(day_type)5 4.665e-01 1.222e+00 0.382 0.7040
## as.factor(day_type)6 3.529e-01 1.193e+00 0.296 0.7684
## as.factor(day_type)7 -1.021e+00 1.218e+00 -0.838 0.4056
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.569 on 55 degrees of freedom
## (303 observations deleted due to missingness)
## Multiple R-squared: 0.8965, Adjusted R-squared: 0.8721
## F-statistic: 36.66 on 13 and 55 DF, p-value: < 2.2e-16
AIC(product1_reg)
## [1] 340.3843
The daytype regressor had no significant effect on the model.Therefore, we removed the back daytype from the model.
product1_reg <- lm(sold_count ~ price + basket_count + category_basket +
category_favored + ty_visits, product1_data)
summary(product1_reg)
##
## Call:
## lm(formula = sold_count ~ price + basket_count + category_basket +
## category_favored + ty_visits, data = product1_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.675 -1.334 -0.380 1.095 7.449
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.988e+00 2.088e+00 2.868 0.00561 **
## price -4.417e-03 2.625e-03 -1.683 0.09742 .
## basket_count 1.945e-01 9.330e-03 20.841 < 2e-16 ***
## category_basket 1.259e-06 7.434e-07 1.693 0.09532 .
## category_favored -5.345e-05 7.502e-06 -7.124 1.21e-09 ***
## ty_visits -2.585e-08 9.533e-09 -2.711 0.00863 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.565 on 63 degrees of freedom
## (303 observations deleted due to missingness)
## Multiple R-squared: 0.8819, Adjusted R-squared: 0.8725
## F-statistic: 94.1 on 5 and 63 DF, p-value: < 2.2e-16
AIC(product1_reg)
## [1] 333.5134
When we look at the graph, the forecast and the actual values overlap in general, only the forecast graph was cut in some periods, this is because the value of the price regressor in our linear regression model was not given in some periods in the excel data that given to us.
product1_data[, `:=`(forecast2, predict(product1_reg, product1_data))]
ggplot(product1_data, aes(x = event_date)) + geom_line(aes(y = sold_count,
color = "real"), size = 1.5) + geom_line(aes(y = forecast2,
color = "predicted"), size = 1.5) + theme_dark()
## Warning: Removed 127 row(s) containing missing values (geom_path).
If we see the sales period more closely:
ggplot(product1_data[as.Date(event_date) <= "2021-01-01" & as.Date(event_date) >=
"2020-10-01"], aes(x = event_date)) + geom_line(aes(y = sold_count,
color = "real"), size = 1.5) + geom_line(aes(y = forecast2,
color = "predicted"), size = 1.5) + theme_dark()
## Warning: Removed 17 row(s) containing missing values (geom_path).
We did decomposition at daily level for product1.Then, we build arima model with auto arima function.
datats <- ts(product1_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
ts_decomposed <- decompose(x = datats, type = "additive")
model = auto.arima(ts_decomposed$random, max.p = 2, max.q = 2)
AIC(model)
## [1] 12491.09
We compared the aic values of the arima model and the linear model and chose the linear model.In order to be able to forecast with linear model in the test period, we must also forecast the value of regressors by constructing arima models.
We build arima models by using past data for forecasting predictors or regressors of linear model of product 1.
we build arima model with decomposing at daily level for basket_count regressor and forecast the value of basket_count on the desired day in competition period.
ggplot(product1_data, aes(x = event_date, y = basket_count)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
ggplot(product1_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >=
"2021-05-01"], aes(x = event_date, y = basket_count)) + geom_line(colour = "firebrick2",
size = 1.5) + theme_dark()
basket_count_ts <- ts(product1_data$basket_count, start = as.Date("2021-05-04"),
end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 829.9022
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 16)$pred
seasonality = basket_count_dec$seasonal[1:16]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)],
1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value +
seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 13.516082 16.281420 13.170164 9.096985 7.575588 10.531312 11.747101
## [8] 12.152816 11.341370 11.304996 10.528811 10.424621 11.352183 10.750536
## [15] 10.853123 11.315081
we build arima model with decomposing at daily level for category_basket regressor and forecast the value of category_basket on the desired day in the competition period.
ggplot(product1_data, aes(x = event_date, y = category_basket)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
category_basket_ts <- ts(product1_data$category_basket, start = as.Date("2021-02-10"),
end = as.Date("2021-05-31"), frequency = 7)
category_basket_dec <- decompose(x = category_basket_ts, type = "additive")
category_basket_model = auto.arima(category_basket_dec$random)
AIC(category_basket_model)
## [1] 20261.14
category_basket_model_forecast <- predict(category_basket_model,
n.ahead = 16)$pred
seasonality = category_basket_dec$seasonal[1:16]
last_trend_value <- tail(category_basket_dec$trend[!is.na(category_basket_dec$trend)],
1)
category_basket_model_forecast = category_basket_model_forecast +
last_trend_value + seasonality
category_basket_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 535119.9 537626.4 575312.0 545347.8 508458.8 524560.0 545403.9 528680.5
## [9] 535887.3 573328.2 545817.4 509150.1 526335.6 544140.2 530285.0 536722.4
we build arima model with decomposing at daily level for Category_favored regressor and forecast the value of category_favored on the desired day in the competition period.
ggplot(product1_data, aes(x = event_date, y = category_favored)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
fitted = auto.arima(product1_data$category_favored)
c = forecast(fitted, h = 16)
category_favored_model_forecast = c$mean
category_favored_model_forecast
## Time Series:
## Start = 373
## End = 388
## Frequency = 1
## [1] 6975.930 6957.469 6828.073 6854.624 6955.408 6925.019 6847.408 6878.740
## [9] 6937.782 6907.394 6863.077 6891.366 6924.130 6898.571 6874.774 6897.335
ggplot(product1_data, aes(x = event_date, y = ty_visits)) + geom_line(colour = "firebrick2",
size = 1.5) + theme_dark()
ty_visits_ts <- ts(product1_data$ty_visits, start = as.Date("2021-02-07"),
end = as.Date("2021-05-31"), frequency = 7)
ty_visits_dec <- decompose(x = ty_visits_ts, type = "additive")
ty_visits_model = auto.arima(ty_visits_dec$random)
AIC(ty_visits_model)
## [1] 27395.99
ty_visits_model_forecast <- predict(ty_visits_model, n.ahead = 16)$pred
seasonality = ty_visits_dec$seasonal[1:16]
last_trend_value <- tail(ty_visits_dec$trend[!is.na(ty_visits_dec$trend)],
1)
ty_visits_model_forecast = ty_visits_model_forecast + last_trend_value +
seasonality
ty_visits_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 119538618 120121585 127286838 125263590 119946681 118228548 120508962
## [8] 119625332 120422516 126742923 124677015 119907503 118593636 120600688
## [15] 119696340 120668941
We made our predictions in the linear model we built by using the fitted values of the regressors we found in the arima models.
predict(product1_reg, data.frame(price = 449.985, basket_count = basket_count_model_forecast[16],
category_basket = category_basket_model_forecast[16], category_favored = category_favored_model_forecast[16],
ty_visits = ty_visits_model_forecast[16]))
## 1
## 3.389179
When we look at the sales chart of product2, it is understood that it is a summer product.Sales volumes showed an upward trend in May.
product2_data <- daily_data[product_content_id == 73318567]
ggplot(product2_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
If we see the sales period more closely:
ggplot(product2_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >=
"2021-04-01"], aes(x = event_date, y = sold_count)) + geom_line(colour = "firebrick2",
size = 1.5) + theme_dark()
We established both arima and linear regression models for the product 2.Then we compared these forecasting models and chose the best one.At first, we built a linear regression model using all predictors for product2.
product2_reg <- lm(sold_count ~ price + visit_count + basket_count +
favored_count + category_sold + category_visits + category_basket +
category_favored + category_brand_sold + ty_visits, product2_data)
summary(product2_reg)
##
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count +
## favored_count + category_sold + category_visits + category_basket +
## category_favored + category_brand_sold + ty_visits, data = product2_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.031 -6.321 0.987 5.517 22.830
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.309e+02 5.817e+02 -0.397 0.69247
## price 3.951e+00 9.683e+00 0.408 0.68431
## visit_count -2.204e-03 1.065e-03 -2.069 0.04174 *
## basket_count 2.143e-01 1.522e-02 14.078 < 2e-16 ***
## favored_count 4.831e-03 8.174e-03 0.591 0.55614
## category_sold 1.396e-02 3.156e-03 4.423 3.04e-05 ***
## category_visits 9.219e-05 4.741e-05 1.945 0.05535 .
## category_basket 1.047e-04 3.738e-05 2.802 0.00637 **
## category_favored -1.513e-03 4.482e-04 -3.376 0.00114 **
## category_brand_sold -5.679e-04 2.864e-04 -1.983 0.05078 .
## ty_visits -1.190e-07 4.082e-08 -2.917 0.00459 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.916 on 80 degrees of freedom
## (281 observations deleted due to missingness)
## Multiple R-squared: 0.985, Adjusted R-squared: 0.9832
## F-statistic: 526.2 on 10 and 80 DF, p-value: < 2.2e-16
We’ve seen that basket_count and category_sold are most effective predictors.The adjusted R-squared value was also very high.The p value of the f test is fine.
product2_reg <- lm(sold_count ~ visit_count + basket_count +
favored_count + category_sold + category_visits + category_basket +
category_favored + category_brand_sold + ty_visits, product2_data)
summary(product2_reg)
##
## Call:
## lm(formula = sold_count ~ visit_count + basket_count + favored_count +
## category_sold + category_visits + category_basket + category_favored +
## category_brand_sold + ty_visits, data = product2_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.1677 -1.0210 -0.1672 1.2207 26.3600
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.973e-01 5.174e-01 1.348 0.17863
## visit_count -3.953e-04 5.202e-04 -0.760 0.44784
## basket_count 2.230e-01 7.330e-03 30.416 < 2e-16 ***
## favored_count -9.409e-03 3.283e-03 -2.866 0.00440 **
## category_sold 2.643e-03 6.653e-04 3.973 8.58e-05 ***
## category_visits 3.666e-06 7.381e-06 0.497 0.61972
## category_basket 5.862e-05 1.120e-05 5.233 2.84e-07 ***
## category_favored -2.487e-04 6.428e-05 -3.869 0.00013 ***
## category_brand_sold -2.455e-04 9.108e-05 -2.695 0.00736 **
## ty_visits -7.481e-08 9.687e-09 -7.722 1.13e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.552 on 362 degrees of freedom
## Multiple R-squared: 0.9863, Adjusted R-squared: 0.9859
## F-statistic: 2886 on 9 and 362 DF, p-value: < 2.2e-16
product2_reg <- lm(sold_count ~ visit_count + basket_count +
favored_count + category_sold + category_basket + category_favored +
category_brand_sold + ty_visits, product2_data)
summary(product2_reg)
##
## Call:
## lm(formula = sold_count ~ visit_count + basket_count + favored_count +
## category_sold + category_basket + category_favored + category_brand_sold +
## ty_visits, data = product2_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.3270 -0.9939 -0.1711 1.2007 26.2894
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.746e-01 4.542e-01 1.265 0.20664
## visit_count -3.430e-04 5.089e-04 -0.674 0.50076
## basket_count 2.238e-01 7.128e-03 31.396 < 2e-16 ***
## favored_count -9.772e-03 3.197e-03 -3.056 0.00241 **
## category_sold 2.586e-03 6.546e-04 3.950 9.38e-05 ***
## category_basket 5.595e-05 9.814e-06 5.701 2.47e-08 ***
## category_favored -2.291e-04 5.064e-05 -4.523 8.26e-06 ***
## category_brand_sold -2.343e-04 8.817e-05 -2.658 0.00822 **
## ty_visits -7.235e-08 8.326e-09 -8.690 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.547 on 363 degrees of freedom
## Multiple R-squared: 0.9862, Adjusted R-squared: 0.9859
## F-statistic: 3253 on 8 and 363 DF, p-value: < 2.2e-16
product2_reg <- lm(sold_count ~ basket_count + favored_count +
category_sold + category_basket + category_favored + category_brand_sold +
ty_visits, product2_data)
summary(product2_reg)
##
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold +
## category_basket + category_favored + category_brand_sold +
## ty_visits, data = product2_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.2786 -0.9534 -0.1677 1.1699 25.5679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.598e-01 4.534e-01 1.235 0.217661
## basket_count 2.202e-01 4.800e-03 45.879 < 2e-16 ***
## favored_count -1.167e-02 1.516e-03 -7.695 1.34e-13 ***
## category_sold 2.553e-03 6.523e-04 3.914 0.000108 ***
## category_basket 5.476e-05 9.646e-06 5.677 2.81e-08 ***
## category_favored -2.279e-04 5.058e-05 -4.506 8.90e-06 ***
## category_brand_sold -2.217e-04 8.610e-05 -2.575 0.010414 *
## ty_visits -7.257e-08 8.313e-09 -8.729 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.542 on 364 degrees of freedom
## Multiple R-squared: 0.9862, Adjusted R-squared: 0.986
## F-statistic: 3723 on 7 and 364 DF, p-value: < 2.2e-16
We further increased the adjusted R-squared value by removing redundant predictors.
product2_data <- cbind(product2_data, day_type)
names(product2_data)[14] <- "day_type"
product2_reg <- lm(sold_count ~ basket_count + favored_count +
category_sold + category_basket + category_favored + category_brand_sold +
ty_visits + as.factor(day_type), product2_data)
summary(product2_reg)
##
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold +
## category_basket + category_favored + category_brand_sold +
## ty_visits + as.factor(day_type), data = product2_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.3240 -1.1225 -0.0821 1.2715 25.3379
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.751e-01 8.466e-01 -0.561 0.574981
## basket_count 2.203e-01 4.831e-03 45.611 < 2e-16 ***
## favored_count -1.174e-02 1.527e-03 -7.688 1.45e-13 ***
## category_sold 2.500e-03 6.646e-04 3.761 0.000198 ***
## category_basket 5.466e-05 9.709e-06 5.630 3.64e-08 ***
## category_favored -2.233e-04 5.161e-05 -4.326 1.98e-05 ***
## category_brand_sold -2.190e-04 8.672e-05 -2.525 0.011990 *
## ty_visits -7.301e-08 8.358e-09 -8.735 < 2e-16 ***
## as.factor(day_type)2 8.852e-01 1.080e+00 0.820 0.412832
## as.factor(day_type)3 1.475e+00 1.079e+00 1.366 0.172698
## as.factor(day_type)4 1.012e+00 1.080e+00 0.937 0.349198
## as.factor(day_type)5 8.642e-01 1.086e+00 0.796 0.426773
## as.factor(day_type)6 1.531e+00 1.083e+00 1.413 0.158421
## as.factor(day_type)7 1.473e+00 1.079e+00 1.365 0.173003
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.565 on 358 degrees of freedom
## Multiple R-squared: 0.9863, Adjusted R-squared: 0.9858
## F-statistic: 1989 on 13 and 358 DF, p-value: < 2.2e-16
We added a daytype column to the product 1 data table to find out if there is daily seasonality in the sales data.The daytype variable returns from 1 to 7 for days.We also added daytype to the linear model.The daytype regressor had no significant effect on the model.Therefore, we removed the back daytype from the model.
product2_reg <- lm(sold_count ~ basket_count + favored_count +
category_sold + category_basket + category_favored + category_brand_sold +
ty_visits, product2_data)
summary(product2_reg)
##
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold +
## category_basket + category_favored + category_brand_sold +
## ty_visits, data = product2_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.2786 -0.9534 -0.1677 1.1699 25.5679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.598e-01 4.534e-01 1.235 0.217661
## basket_count 2.202e-01 4.800e-03 45.879 < 2e-16 ***
## favored_count -1.167e-02 1.516e-03 -7.695 1.34e-13 ***
## category_sold 2.553e-03 6.523e-04 3.914 0.000108 ***
## category_basket 5.476e-05 9.646e-06 5.677 2.81e-08 ***
## category_favored -2.279e-04 5.058e-05 -4.506 8.90e-06 ***
## category_brand_sold -2.217e-04 8.610e-05 -2.575 0.010414 *
## ty_visits -7.257e-08 8.313e-09 -8.729 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.542 on 364 degrees of freedom
## Multiple R-squared: 0.9862, Adjusted R-squared: 0.986
## F-statistic: 3723 on 7 and 364 DF, p-value: < 2.2e-16
AIC(product2_reg)
## [1] 2339.655
We did decomposition at daily level for product1.Then, we build arima model with auto arima function.
datats2 <- ts(product2_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
ts_dec_add <- decompose(x = datats2, type = "additive")
model2 = auto.arima(ts_dec_add$random, max.p = 2, max.q = 2)
AIC(model2)
## [1] 19849.58
We compared the aic values of the arima model and the linear model and chose the linear model.In order to be able to forecast with linear model in the test period, we must also forecast the value of regressors by constructing arima models.
We build arima models by using past data for forecasting predictors or regressors of linear model of product 2.
we build arima model with decomposing at daily level for basket_count regressor and forecast the value of basket_count on the desired day in competition period.
ggplot(product2_data, aes(x = event_date, y = basket_count)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
ggplot(product2_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >=
"2021-05-01"], aes(x = event_date, y = basket_count)) + geom_line(colour = "firebrick2",
size = 1.5) + theme_dark()
basket_count_ts <- ts(product2_data$basket_count, start = as.Date("2021-02-09"),
end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 8716.526
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 16)$pred
seasonality = basket_count_dec$seasonal[1:16]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)],
1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value +
seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 456.4614 493.7593 540.5048 520.2998 482.9880 472.1964 464.4085 461.4595
## [9] 483.1937 516.1933 509.0330 486.0518 473.1542 466.6694 462.7477 480.3704
we build arima model with decomposing at daily level for favored_count regressor and forecast the value of favored_count on the desired day in the competition period.
ggplot(product2_data, aes(x = event_date, y = favored_count)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
ggplot(product2_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >=
"2021-05-01"], aes(x = event_date, y = favored_count)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
favored_count_ts <- ts(product2_data$favored_count, start = as.Date("2021-02-13"),
end = as.Date("2021-05-31"), frequency = 7)
favored_count_dec <- decompose(x = favored_count_ts, type = "additive")
favored_count_model = auto.arima(favored_count_dec$random, max.p = 3,
max.q = 3)
AIC(favored_count_model)
## [1] 10299.58
favored_count_model_forecast <- predict(favored_count_model,
n.ahead = 16)$pred
seasonality = favored_count_dec$seasonal[1:16]
last_trend_value <- tail(favored_count_dec$trend[!is.na(favored_count_dec$trend)],
1)
favored_count_model_forecast = favored_count_model_forecast +
last_trend_value + seasonality
favored_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 1824.777 2153.312 2681.863 2373.166 2044.357 2065.380 2141.401 2190.460
## [9] 2101.854 1930.458 2055.127 2111.052 2070.982 2071.007 2077.568 2113.272
we build arima model with decomposing at daily level for category_sold regressor and forecast the value of category_sold on the desired day in the competition period.
ggplot(product2_data, aes(x = event_date, y = category_sold)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
ggplot(product2_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >=
"2021-05-01"], aes(x = event_date, y = category_sold)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
category_sold_ts <- ts(product2_data$category_sold, start = as.Date("2021-02-13"),
end = as.Date("2021-05-31"), frequency = 7)
category_sold_dec <- decompose(x = category_sold_ts, type = "additive")
category_sold_model = auto.arima(category_sold_dec$random, max.p = 3,
max.q = 3)
AIC(category_sold_model)
## [1] 10544.62
category_sold_model_forecast <- predict(category_sold_model,
n.ahead = 16)$pred
seasonality = category_sold_dec$seasonal[1:16]
last_trend_value <- tail(category_sold_dec$trend[!is.na(category_sold_dec$trend)],
1)
category_sold_model_forecast = category_sold_model_forecast +
last_trend_value + seasonality
category_sold_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 4055.494 4645.928 4668.572 4119.550 3755.913 3742.885 3994.022 4172.623
## [9] 4224.037 4143.844 3968.022 3947.163 4032.239 4118.171 4096.042 4068.097
ggplot(product2_data, aes(x = event_date, y = category_basket)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
ggplot(product2_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >=
"2021-05-01"], aes(x = event_date, y = category_basket)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
category_basket_ts <- ts(product2_data$category_basket, start = as.Date("2021-02-13"),
end = as.Date("2021-05-31"), frequency = 7)
category_basket_dec <- decompose(x = category_basket_ts, type = "additive")
category_basket_model = auto.arima(category_basket_dec$random,
max.p = 2, max.q = 2)
AIC(category_basket_model)
## [1] 18077.75
category_basket_model_forecast <- predict(category_basket_model,
n.ahead = 16)$pred
seasonality = category_basket_dec$seasonal[1:16]
last_trend_value <- tail(category_basket_dec$trend[!is.na(category_basket_dec$trend)],
1)
category_basket_model_forecast = category_basket_model_forecast +
last_trend_value + seasonality
category_basket_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 495831.4 344890.7 251235.0 241199.6 273443.1 340637.0 391791.4 409243.7
## [9] 418889.1 409840.0 387829.4 309074.0 279205.0 293899.9 333281.9 375688.2
fitted = auto.arima(product2_data$category_favored)
a = forecast(fitted, h = 16)
category_favored_model_forecast = a$mean
ggplot(product2_data, aes(x = event_date, y = category_brand_sold)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
category_brand_sold_ts <- ts(product2_data$category_brand_sold,
start = as.Date("2021-01-26"), end = as.Date("2021-05-31"),
frequency = 7)
category_brand_sold_dec <- decompose(x = category_brand_sold_ts,
type = "additive")
category_brand_sold_model = auto.arima(category_brand_sold_dec$random,
max.p = 2, max.q = 2)
AIC(category_brand_sold_model)
## [1] 17870.52
category_brand_sold_model_forecast <- predict(category_brand_sold_model,
n.ahead = 16)$pred
seasonality = category_brand_sold_dec$seasonal[1:16]
last_trend_value <- tail(category_brand_sold_dec$trend[!is.na(category_brand_sold_dec$trend)],
1)
category_brand_sold_model_forecast = category_brand_sold_model_forecast +
last_trend_value + seasonality
category_brand_sold_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 11563.63 14974.64 15552.65 14983.35 12818.26 12058.24 11823.64 11867.48
## [9] 14118.44 16579.25 15880.05 12965.66 11394.18 11289.36 12795.02 15079.36
ggplot(product2_data, aes(x = event_date, y = ty_visits)) + geom_line(colour = "firebrick2",
size = 1.5) + theme_dark()
ty_visits_ts <- ts(product2_data$ty_visits, start = as.Date("2021-01-31"),
end = as.Date("2021-05-31"), frequency = 7)
ty_visits_dec <- decompose(x = ty_visits_ts, type = "additive")
ty_visits_model = auto.arima(ty_visits_dec$random)
AIC(ty_visits_model)
## [1] 29151.66
ty_visits_model_forecast <- predict(ty_visits_model, n.ahead = 16)$pred
seasonality = ty_visits_dec$seasonal[1:16]
last_trend_value <- tail(ty_visits_dec$trend[!is.na(ty_visits_dec$trend)],
1)
ty_visits_model_forecast = ty_visits_model_forecast + last_trend_value +
seasonality
ty_visits_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 100795851 100711318 107932972 104924710 99275802 100100587 102194310
## [8] 101905062 101209260 105388726 103236585 99133886 100120419 102106750
## [15] 101767595 101365562
We made our predictions in the linear model we built by using the fitted values of the regressors we found in the arima models.
predict(product2_reg, data.frame(basket_count = basket_count_model_forecast[16],
favored_count = favored_count_model_forecast[16], category_sold = category_sold_model_forecast[16],
category_basket = category_basket_model_forecast[16], category_favored = category_favored_model_forecast[16],
category_brand_sold = category_brand_sold_model_forecast[16],
ty_visits = ty_visits_model_forecast[16]))
## 1
## 99.87471
Product3 is the product of the same category as product2.Sales increase during and before the summer months.
product3_data <- daily_data[product_content_id == 32737302]
ggplot(product3_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
If we see the sales period more closely:
ggplot(product3_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >=
"2021-02-01"], aes(x = event_date, y = sold_count)) + geom_line(colour = "firebrick2",
size = 1.5) + theme_dark()
We constructed a linear regression model for product3 using all predictors.
product3_reg <- lm(sold_count ~ price + visit_count + basket_count +
favored_count + category_sold + category_visits + category_basket +
category_favored + category_brand_sold + ty_visits, product3_data)
summary(product3_reg)
##
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count +
## favored_count + category_sold + category_visits + category_basket +
## category_favored + category_brand_sold + ty_visits, data = product3_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.5489 -3.7959 -0.5739 4.8847 17.9934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.528e+01 2.438e+01 1.447 0.15006
## price -5.575e-01 3.986e-01 -1.399 0.16412
## visit_count -1.673e-04 2.092e-03 -0.080 0.93637
## basket_count 1.720e-01 1.327e-02 12.958 < 2e-16 ***
## favored_count -7.974e-03 1.083e-02 -0.736 0.46280
## category_sold 3.237e-03 1.233e-03 2.625 0.00964 **
## category_visits 2.208e-05 1.000e-05 2.208 0.02891 *
## category_basket 3.483e-05 1.701e-05 2.047 0.04256 *
## category_favored -3.099e-04 1.078e-04 -2.875 0.00468 **
## category_brand_sold -2.751e-04 1.618e-04 -1.701 0.09121 .
## ty_visits -2.793e-08 2.391e-08 -1.168 0.24469
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.913 on 139 degrees of freedom
## (222 observations deleted due to missingness)
## Multiple R-squared: 0.8373, Adjusted R-squared: 0.8256
## F-statistic: 71.51 on 10 and 139 DF, p-value: < 2.2e-16
We’ve seen that basket_count, category_sold and category_favored are effective predictors.The adjusted R-squared value was also high.The p value of the f test is fine.
product3_reg <- lm(sold_count ~ basket_count + favored_count +
category_sold + category_visits + category_basket + category_favored +
category_brand_sold, product3_data)
summary(product3_reg)
##
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold +
## category_visits + category_basket + category_favored + category_brand_sold,
## data = product3_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.1505 -0.7810 -0.1247 0.8607 21.1286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.745e-01 3.727e-01 1.005 0.315600
## basket_count 1.751e-01 6.137e-03 28.533 < 2e-16 ***
## favored_count -1.419e-02 2.973e-03 -4.774 2.62e-06 ***
## category_sold 2.781e-03 6.039e-04 4.605 5.72e-06 ***
## category_visits 1.230e-05 4.195e-06 2.933 0.003568 **
## category_basket 1.450e-05 6.045e-06 2.399 0.016959 *
## category_favored -1.949e-04 5.141e-05 -3.791 0.000176 ***
## category_brand_sold -1.145e-04 6.319e-05 -1.813 0.070711 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.389 on 364 degrees of freedom
## Multiple R-squared: 0.9349, Adjusted R-squared: 0.9336
## F-statistic: 746.6 on 7 and 364 DF, p-value: < 2.2e-16
AIC(product3_reg)
## [1] 2165.977
We further increased the adjusted R-squared value by removing redundant predictors.
product3_data <- cbind(product3_data, day_type)
names(product3_data)[14] <- "day_type"
product3_reg <- lm(sold_count ~ basket_count + favored_count +
category_sold + category_visits + category_basket + category_favored +
category_brand_sold + as.factor(day_type), product3_data)
summary(product3_reg)
##
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold +
## category_visits + category_basket + category_favored + category_brand_sold +
## as.factor(day_type), data = product3_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.5747 -1.1502 0.0102 0.9884 20.5998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.756e-01 6.711e-01 -0.411 0.681580
## basket_count 1.756e-01 6.164e-03 28.492 < 2e-16 ***
## favored_count -1.442e-02 2.991e-03 -4.821 2.11e-06 ***
## category_sold 2.715e-03 6.118e-04 4.438 1.21e-05 ***
## category_visits 1.266e-05 4.220e-06 2.999 0.002894 **
## category_basket 1.417e-05 6.081e-06 2.330 0.020344 *
## category_favored -1.921e-04 5.204e-05 -3.692 0.000257 ***
## category_brand_sold -1.099e-04 6.375e-05 -1.725 0.085452 .
## as.factor(day_type)2 4.389e-01 8.520e-01 0.515 0.606769
## as.factor(day_type)3 2.546e-01 8.512e-01 0.299 0.765041
## as.factor(day_type)4 1.481e+00 8.518e-01 1.739 0.082884 .
## as.factor(day_type)5 1.479e-01 8.564e-01 0.173 0.862964
## as.factor(day_type)6 1.182e+00 8.553e-01 1.382 0.167878
## as.factor(day_type)7 9.857e-01 8.523e-01 1.157 0.248220
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.392 on 358 degrees of freedom
## Multiple R-squared: 0.9359, Adjusted R-squared: 0.9335
## F-statistic: 401.7 on 13 and 358 DF, p-value: < 2.2e-16
We added the day type regressor to the model, if the data includes daily seasonality, the model is expected to improve. The daytype regressor had no significant effect on the model.Therefore, we removed daytype regressor from the model.
product3_reg <- lm(sold_count ~ basket_count + favored_count +
category_sold + category_visits + category_basket + category_favored +
category_brand_sold, product3_data)
summary(product3_reg)
##
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold +
## category_visits + category_basket + category_favored + category_brand_sold,
## data = product3_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.1505 -0.7810 -0.1247 0.8607 21.1286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.745e-01 3.727e-01 1.005 0.315600
## basket_count 1.751e-01 6.137e-03 28.533 < 2e-16 ***
## favored_count -1.419e-02 2.973e-03 -4.774 2.62e-06 ***
## category_sold 2.781e-03 6.039e-04 4.605 5.72e-06 ***
## category_visits 1.230e-05 4.195e-06 2.933 0.003568 **
## category_basket 1.450e-05 6.045e-06 2.399 0.016959 *
## category_favored -1.949e-04 5.141e-05 -3.791 0.000176 ***
## category_brand_sold -1.145e-04 6.319e-05 -1.813 0.070711 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.389 on 364 degrees of freedom
## Multiple R-squared: 0.9349, Adjusted R-squared: 0.9336
## F-statistic: 746.6 on 7 and 364 DF, p-value: < 2.2e-16
AIC(product3_reg)
## [1] 2165.977
We did decomposition at daily level for product1.Then, we build arima model with auto arima function.
datats3 <- ts(product3_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
ts_dec_add2 <- decompose(x = datats3, type = "additive")
plot(ts_dec_add2)
model3 = auto.arima(ts_dec_add2$random, max.p = 2, max.q = 2)
AIC(model3)
## [1] 15866.43
We compared the aic values of the arima model and the linear model and chose the linear model.In order to be able to forecast with linear model in the test period, we must also forecast the value of regressors by constructing arima models.
We build arima models by using past data for forecasting predictors or regressors of linear model of product 3.
we build arima model with decomposing at daily level for basket_count regressor and forecast the value of basket_count on the desired day in competition period.
ggplot(product3_data, aes(x = event_date, y = basket_count)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
ggplot(product3_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >=
"2021-05-01"], aes(x = event_date, y = basket_count)) + geom_line(colour = "firebrick2",
size = 1.5) + theme_dark()
basket_count_ts <- ts(product3_data$basket_count, start = as.Date("2021-02-13"),
end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 6917.446
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 16)$pred
seasonality = basket_count_dec$seasonal[1:16]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)],
1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value +
seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 250.1791 263.0404 266.2987 251.2876 248.4188 250.5081 251.5405 252.5490
## [9] 256.6915 258.2287 250.0061 249.1856 251.0730 251.6152 252.7960 256.0298
ggplot(product3_data, aes(x = event_date, y = favored_count)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
ggplot(product3_data[as.Date(event_date) <= "2021-05-31" & as.Date(event_date) >=
"2021-05-01"], aes(x = event_date, y = favored_count)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
favored_count_ts <- ts(product3_data$favored_count, start = as.Date("2021-02-13"),
end = as.Date("2021-05-31"), frequency = 7)
favored_count_dec <- decompose(x = favored_count_ts, type = "additive")
favored_count_model = auto.arima(favored_count_dec$random)
AIC(favored_count_model)
## [1] 8573.249
favored_count_model_forecast <- predict(favored_count_model,
n.ahead = 16)$pred
seasonality = favored_count_dec$seasonal[1:16]
last_trend_value <- tail(favored_count_dec$trend[!is.na(favored_count_dec$trend)],
1)
favored_count_model_forecast = favored_count_model_forecast +
last_trend_value + seasonality
favored_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 407.7535 433.2656 441.7602 416.3097 404.7530 403.8694 404.2885 407.7535
## [9] 433.2656 441.7602 416.3097 404.7530 403.8694 404.2885 407.7535 433.2656
ggplot(product3_data, aes(x = event_date, y = category_sold)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
category_sold_ts <- ts(product3_data$category_sold, start = as.Date("2021-04-07"),
end = as.Date("2021-05-31"), frequency = 7)
category_sold_dec <- decompose(x = category_sold_ts, type = "additive")
category_sold_model = auto.arima(category_sold_dec$random)
AIC(category_sold_model)
## [1] 5207.815
category_sold_model_forecast <- predict(category_sold_model,
n.ahead = 16)$pred
seasonality = category_sold_dec$seasonal[1:16]
last_trend_value <- tail(category_sold_dec$trend[!is.na(category_sold_dec$trend)],
1)
category_sold_model_forecast = category_sold_model_forecast +
last_trend_value + seasonality
category_sold_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 4684.988 4829.658 4458.404 4249.587 4258.094 4427.342 4397.424 3995.785
## [9] 4200.285 4119.709 4279.729 4276.953 4215.899 4252.005 4399.121 4460.895
ggplot(product3_data, aes(x = event_date, y = category_visits)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
category_visits_ts <- ts(product3_data$category_visits, start = as.Date("2021-04-07"),
end = as.Date("2021-05-31"), frequency = 7)
category_visits_dec <- decompose(x = category_visits_ts, type = "additive")
category_visits_model = auto.arima(category_visits_dec$random,
max.p = 3, max.q = 3)
AIC(category_visits_model)
## [1] 9171.247
category_visits_model_forecast <- predict(category_visits_model,
n.ahead = 16)$pred
seasonality = category_visits_dec$seasonal[1:16]
last_trend_value <- tail(category_visits_dec$trend[!is.na(category_visits_dec$trend)],
1)
category_visits_model_forecast = category_visits_model_forecast +
last_trend_value + seasonality
category_visits_model_forecast = abs(category_visits_model_forecast)
category_visits_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 1523918.18 940639.85 83084.83 470609.77 162197.03 1143245.98
## [7] 1415453.68 722371.87 166027.58 335869.24 368998.81 1190433.93
## [13] 1285940.95 549429.64 223524.92 206304.91
ggplot(product3_data, aes(x = event_date, y = category_basket)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
category_basket_ts <- ts(product3_data$category_basket, start = as.Date("2021-02-13"),
end = as.Date("2021-05-31"), frequency = 7)
category_basket_dec <- decompose(x = category_basket_ts, type = "additive")
category_basket_model = auto.arima(category_basket_dec$random,
max.p = 2, max.q = 2)
AIC(category_basket_model)
## [1] 18077.75
category_basket_model_forecast <- predict(category_basket_model,
n.ahead = 16)$pred
seasonality = category_basket_dec$seasonal[1:16]
last_trend_value <- tail(category_basket_dec$trend[!is.na(category_basket_dec$trend)],
1)
category_basket_model_forecast = category_basket_model_forecast +
last_trend_value + seasonality
category_basket_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 495831.4 344890.7 251235.0 241199.6 273443.1 340637.0 391791.4 409243.7
## [9] 418889.1 409840.0 387829.4 309074.0 279205.0 293899.9 333281.9 375688.2
fitted = auto.arima(product3_data$category_favored)
b = forecast(fitted, h = 16)
category_favored_model_forecast = b$mean
category_favored_model_forecast
## Time Series:
## Start = 373
## End = 388
## Frequency = 1
## [1] 9601.316 9511.373 9382.591 9259.483 9188.624 9151.846 9134.385 9128.304
## [9] 9127.363 9128.146 9129.241 9130.118 9130.676 9130.978 9131.119 9131.170
ggplot(product3_data, aes(x = event_date, y = category_brand_sold)) +
geom_line(colour = "firebrick2", size = 1.5) + theme_dark()
category_brand_sold_ts <- ts(product3_data$category_brand_sold,
start = as.Date("2021-01-31"), end = as.Date("2021-05-31"),
frequency = 7)
category_brand_sold_dec <- decompose(x = category_brand_sold_ts,
type = "additive")
category_brand_sold_model = auto.arima(category_brand_sold_dec$random,
max.p = 2, max.q = 2)
AIC(category_brand_sold_model)
## [1] 17175.24
category_brand_sold_model_forecast <- predict(category_brand_sold_model,
n.ahead = 16)$pred
seasonality = category_brand_sold_dec$seasonal[1:16]
last_trend_value <- tail(category_brand_sold_dec$trend[!is.na(category_brand_sold_dec$trend)],
1)
category_brand_sold_model_forecast = category_brand_sold_model_forecast +
last_trend_value + seasonality
category_brand_sold_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18780, 3)
## Frequency = 7
## [1] 84717.26 104296.22 89807.20 69501.46 63849.94 73107.90 82429.50
## [8] 83365.63 79142.25 73190.98 74166.85 77429.82 79073.77 77369.99
## [15] 75087.70 76381.26
We made our predictions in the linear model we built by using the fitted values of the regressors we found in the arima models.
predict(product3_reg, data.frame(basket_count = basket_count_model_forecast[16],
favored_count = favored_count_model_forecast[16], category_sold = category_sold_model_forecast[16],
category_visits = category_visits_model_forecast[16], category_basket = category_basket_model_forecast[16],
category_favored = category_favored_model_forecast[16], category_brand_sold = category_brand_sold_model_forecast[16]))
## 1
## 48.9182
data_path = "/Users/onurcanaydin/Desktop/360 proje/ProjectRawData.csv"
projectdata <- fread(data_path)
You can also embed plots, for example:
IslakMendil <- projectdata[product_content_id == 4066298]
head(IslakMendil)
## event_date product_content_id price sold_count visit_count basket_count
## 1: 2021-05-31 4066298 65.80651 1398 19102 5096
## 2: 2021-05-30 4066298 72.50000 598 6217 1716
## 3: 2021-05-29 4066298 72.51379 528 6251 1782
## 4: 2021-05-28 4066298 72.50000 538 5564 1507
## 5: 2021-05-27 4066298 72.50000 588 5857 1646
## 6: 2021-05-26 4066298 72.50000 589 6089 1771
## favored_count category_sold category_visits category_basket category_favored
## 1: 1703 6547 108811 28558 11913
## 2: 494 3691 52128 12777 5221
## 3: 509 3604 54600 13328 5178
## 4: 405 1598 3308 48466 12414
## 5: 397 1775 3789 53058 13480
## 6: 409 1948 4017 56226 14497
## category_brand_sold ty_visits
## 1: 4286 125439876
## 2: 1729 131821083
## 3: 1739 129670029
## 4: 4389 103514886
## 5: 4685 107391579
## 6: 5388 106195988
ts.plot(IslakMendil[, c("sold_count")], main = "Daily Sales Quantity",
xlab = "Time", ylab = "Sales Quantity")
In the plot we can see that sales quantity goes costant most of the time but sometimes there are huge increases probably due to the discounts.
We are adding month, day, trend attributes to the data in order to better check and we are taking the logarithm of the sold_count in order to reduce the variance’s effect.
IslakMendil[, `:=`(month, month(event_date))]
IslakMendil[, `:=`(day, lubridate::wday(event_date))]
IslakMendil[, `:=`(trend, 1:.N)]
IslakMendil[, `:=`(log_sold, log(sold_count))]
ts.plot(IslakMendil[, c("log_sold")], main = "Daily Log Sales Quantity",
xlab = "Time", ylab = "Sales Quantity Log")
After trying lots of models, the final model is like that with 0.9462 Adjusted R Squared value.
lmIslakMendil <- lm(sold_count ~ basket_count + category_sold +
category_visits + category_favored + category_brand_sold,
data = IslakMendil)
summary(lmIslakMendil)
##
## Call:
## lm(formula = sold_count ~ basket_count + category_sold + category_visits +
## category_favored + category_brand_sold, data = IslakMendil)
##
## Residuals:
## Min 1Q Median 3Q Max
## -515.75 -34.94 3.30 41.24 924.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.0030329 8.8606618 1.016 0.31
## basket_count 0.2482176 0.0103450 23.994 < 2e-16 ***
## category_sold 0.1951942 0.0081431 23.970 < 2e-16 ***
## category_visits -0.0081666 0.0009448 -8.643 < 2e-16 ***
## category_favored -0.0127085 0.0022058 -5.761 1.77e-08 ***
## category_brand_sold -0.0209877 0.0022743 -9.228 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 97.93 on 366 degrees of freedom
## Multiple R-squared: 0.947, Adjusted R-squared: 0.9462
## F-statistic: 1307 on 5 and 366 DF, p-value: < 2.2e-16
checkresiduals(lmIslakMendil)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals
## LM test = 37.645, df = 10, p-value = 4.375e-05
Residuals have zero mean and around zero variance except extraordinary a few points. Residuals look like normally distributed.
##Decomposing
We took the logarithm of the model and we will go with the additive decomposition.
# time series and arima
tsIslakMendil <- ts(IslakMendil$sold_count, frequency = 7)
ts.plot(tsIslakMendil)
decompIslakMendil <- decompose(tsIslakMendil, type = "additive")
acf(tsIslakMendil)
pacf(tsIslakMendil)
finalARIMAmodel2 <- arima(decompIslakMendil$random, order = c(0,
0, 1))
finalARIMAmodel2
##
## Call:
## arima(x = decompIslakMendil$random, order = c(0, 0, 1))
##
## Coefficients:
## ma1 intercept
## 0.2775 -0.3595
## s.e. 0.0471 16.3751
##
## sigma^2 estimated as 60205: log likelihood = -2533.38, aic = 5072.76
By checking the ACF and PACF plots, we decided to create ARIMA(0,0,1) models.
##ARIMA Models of Attributes
basket_count_ts <- ts(IslakMendil$basket_count, start = as.Date("2021-01-04"),
end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 15816.63
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 1)$pred
seasonality = basket_count_dec$seasonal[1:1]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)],
1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value +
seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 1584.868
category_sold_ts <- ts(IslakMendil$category_sold, start = as.Date("2021-02-10"),
end = as.Date("2021-05-31"), frequency = 7)
category_sold_dec <- decompose(x = category_sold_ts, type = "additive")
category_sold_model = auto.arima(category_sold_dec$random)
AIC(category_sold_model)
## [1] 12217.86
category_sold_model_forecast <- predict(category_sold_model,
n.ahead = 1)$pred
seasonality = category_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_sold_dec$trend[!is.na(category_sold_dec$trend)],
1)
category_sold_model_forecast = category_sold_model_forecast +
last_trend_value + seasonality
category_sold_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 1867.783
category_favored_ts <- ts(IslakMendil$category_favored, start = as.Date("2021-02-10"),
end = as.Date("2021-05-31"), frequency = 7)
category_favored_dec <- decompose(x = category_favored_ts, type = "additive")
category_favored_model = auto.arima(category_favored_dec$random)
AIC(category_favored_model)
## [1] 14350.23
category_favored_model_forecast <- predict(category_favored_model,
n.ahead = 1)$pred
seasonality = category_favored_dec$seasonal[1:1]
last_trend_value <- tail(category_favored_dec$trend[!is.na(category_favored_dec$trend)],
1)
category_favored_model_forecast = category_favored_model_forecast +
last_trend_value + seasonality
category_favored_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 15301.39
category_visits_ts <- ts(IslakMendil$category_visits, start = as.Date("2021-02-10"),
end = as.Date("2021-05-31"), frequency = 7)
category_visits_dec <- decompose(x = category_visits_ts, type = "additive")
category_visits_model = auto.arima(category_visits_dec$random)
AIC(category_visits_model)
## [1] 15081.26
category_visits_model_forecast <- predict(category_visits_model,
n.ahead = 1)$pred
seasonality = category_visits_dec$seasonal[1:1]
last_trend_value <- tail(category_visits_dec$trend[!is.na(category_visits_dec$trend)],
1)
category_visits_model_forecast = category_visits_model_forecast +
last_trend_value + seasonality
category_visits_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 4019.674
category_brand_sold_ts <- ts(IslakMendil$category_brand_sold,
start = as.Date("2021-02-10"), end = as.Date("2021-05-31"),
frequency = 7)
category_brand_sold_dec <- decompose(x = category_brand_sold_ts,
type = "additive")
category_brand_sold_model = auto.arima(category_brand_sold_dec$random)
AIC(category_brand_sold_model)
## [1] 12442.73
category_brand_sold_model_forecast <- predict(category_brand_sold_model,
n.ahead = 1)$pred
seasonality = category_brand_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_brand_sold_dec$trend[!is.na(category_brand_sold_dec$trend)],
1)
category_brand_sold_model_forecast = category_brand_sold_model_forecast +
last_trend_value + seasonality
category_brand_sold_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 5193.39
We have forecasted the regressors belong to the lmIslakMendil model, now, we will predict the sold_quantity.
predict(lmIslakMendil, data.frame(basket_count = basket_count_model_forecast,
category_sold = category_sold_model_forecast, category_favored = category_favored_model_forecast,
category_visits = category_visits_model_forecast, category_brand_sold = category_brand_sold_model_forecast))
## 1
## 430.6934
Our prediction for the quantity sold is 430.6934
head(IslakMendil$sold_count, 50)
## [1] 1398 598 528 538 588 589 568 632 727 606 581 768 611 854 940
## [16] 198 144 128 88 263 204 250 131 121 150 183 226 202 249 427
## [31] 489 613 916 993 1132 688 212 352 1140 726 258 292 408 481 422
## [46] 447 533 366 358 329
##YuzTemizleyici
YuzTemizleyici <- projectdata[product_content_id == 85004]
head(YuzTemizleyici)
## event_date product_content_id price sold_count visit_count basket_count
## 1: 2021-05-31 85004 87.39562 80 6002 744
## 2: 2021-05-30 85004 88.62941 85 5265 633
## 3: 2021-05-29 85004 89.57701 67 4925 570
## 4: 2021-05-28 85004 89.22423 52 3774 397
## 5: 2021-05-27 85004 87.25352 71 4189 515
## 6: 2021-05-26 85004 82.22640 125 5641 982
## favored_count category_sold category_visits category_basket category_favored
## 1: 1642 5048 236197 33681 40472
## 2: 1104 4117 215260 25181 36225
## 3: 913 3872 198431 23790 31700
## 4: 868 309 3057 163110 18821
## 5: 985 411 3589 176215 21144
## 6: 932 971 5107 217245 30724
## category_brand_sold ty_visits
## 1: 743 125439876
## 2: 430 131821083
## 3: 437 129670029
## 4: 25401 103514886
## 5: 25610 107391579
## 6: 30844 106195988
summary(YuzTemizleyici)
## event_date product_content_id price sold_count
## Min. :2020-05-25 Min. :85004 Min. :64.90 Min. : 14.0
## 1st Qu.:2020-08-25 1st Qu.:85004 1st Qu.:74.58 1st Qu.: 33.0
## Median :2020-11-26 Median :85004 Median :77.76 Median : 57.0
## Mean :2020-11-26 Mean :85004 Mean :77.66 Mean : 74.2
## 3rd Qu.:2021-02-27 3rd Qu.:85004 3rd Qu.:80.68 3rd Qu.: 87.5
## Max. :2021-05-31 Max. :85004 Max. :89.95 Max. :447.0
## visit_count basket_count favored_count category_sold
## Min. : 0 Min. : 62.0 Min. : 0.0 Min. : 91.0
## 1st Qu.: 0 1st Qu.: 154.8 1st Qu.: 0.0 1st Qu.: 217.8
## Median : 0 Median : 288.0 Median : 0.0 Median : 342.0
## Mean :1405 Mean : 354.1 Mean : 401.5 Mean : 484.4
## 3rd Qu.:3183 3rd Qu.: 482.0 3rd Qu.: 696.0 3rd Qu.: 547.5
## Max. :8325 Max. :1338.0 Max. :2882.0 Max. :5048.0
## category_visits category_basket category_favored category_brand_sold
## Min. : 1476 Min. : 0 Min. : 7465 Min. : 0
## 1st Qu.: 2688 1st Qu.: 0 1st Qu.:13236 1st Qu.: 0
## Median : 3468 Median : 0 Median :18423 Median : 0
## Mean : 6005 Mean : 75121 Mean :22250 Mean :16556
## 3rd Qu.: 4610 3rd Qu.:186296 3rd Qu.:26998 3rd Qu.:31558
## Max. :236197 Max. :467288 Max. :69429 Max. :73350
## ty_visits
## Min. : 1
## 1st Qu.: 1
## Median : 1
## Mean : 39145174
## 3rd Qu.: 97619487
## Max. :178545693
ts.plot(YuzTemizleyici[, c("sold_count")], main = "Daily Sales Quantity",
xlab = "Time", ylab = "Sales Quantity")
acf(YuzTemizleyici$sold_count, lag.max = 90)
From the Daily Sales Quantity graph, we see some seasonality. Also, there are two separate means which is splitted approximately at Time 200.
Also, we can mention some outliers and we want to avoid from these outliers.
If we check the autocorrelation, we see decreasing trend and some pattern at the level of lag=90, which implies that there is a trend and seasonality in the sales data.
for (i in 1:length(YuzTemizleyici$sold_count)) {
if (YuzTemizleyici$sold_count[i] >= 200) {
YuzTemizleyici$sold_count[i] = mean(YuzTemizleyici$sold_count[i -
7:i + 7])
}
}
ts.plot(YuzTemizleyici[, c("sold_count")], main = "Daily Sales Quantity",
xlab = "Time", ylab = "Sales Quantity")
acf(YuzTemizleyici$sold_count, lag.max = 90)
We determined a sales quantity level, which is 200 and choose these points as outliers. In order to get rid of outliers, we take the average of closest 15 days to the outlier points and assigned these new value to the outlier point.In this way, we get rid of outliers but autocorrelation function gives higher results.
We add day, month and trend columns to the data.
Since the variance looks high and getting higher, we take the log of the sold_count.
YuzTemizleyici[, `:=`(month, month(event_date))]
YuzTemizleyici[, `:=`(day, lubridate::wday(event_date))]
YuzTemizleyici[, `:=`(trend, 1:.N)]
YuzTemizleyici[, `:=`(log_sold, log(sold_count))]
ts.plot(YuzTemizleyici[, c("log_sold")], main = "Daily Log Sales Quantity",
xlab = "Time", ylab = "Sales Quantity Log")
From now on, we will be trying different lm models with different attributes and we want to achieve the best regresssion model.
lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count +
favored_count + category_sold + category_visits + category_basket +
category_favored + category_brand_sold + ty_visits + month +
day + trend, data = YuzTemizleyici)
summary(lmYuzTemizleyici)
##
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count +
## favored_count + category_sold + category_visits + category_basket +
## category_favored + category_brand_sold + ty_visits + month +
## day + trend, data = YuzTemizleyici)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64.094 -8.726 -0.560 7.231 72.050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.631e+02 2.715e+01 6.006 4.66e-09 ***
## price -1.947e+00 3.029e-01 -6.427 4.16e-10 ***
## visit_count 1.258e-02 2.603e-03 4.832 2.01e-06 ***
## basket_count 9.236e-02 1.223e-02 7.549 3.67e-13 ***
## favored_count -1.920e-02 4.570e-03 -4.202 3.35e-05 ***
## category_sold -2.580e-02 5.677e-03 -4.545 7.53e-06 ***
## category_visits 5.931e-05 1.305e-04 0.455 0.64965
## category_basket -4.067e-04 5.909e-05 -6.883 2.64e-11 ***
## category_favored 1.652e-03 1.888e-04 8.746 < 2e-16 ***
## category_brand_sold 7.575e-04 1.507e-04 5.026 7.92e-07 ***
## ty_visits 3.000e-07 1.024e-07 2.930 0.00361 **
## month -8.123e-01 4.014e-01 -2.024 0.04374 *
## day -5.860e-01 4.594e-01 -1.276 0.20292
## trend -8.099e-04 2.097e-02 -0.039 0.96922
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.38 on 358 degrees of freedom
## Multiple R-squared: 0.8128, Adjusted R-squared: 0.806
## F-statistic: 119.5 on 13 and 358 DF, p-value: < 2.2e-16
lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count +
favored_count + category_sold + category_basket + category_favored +
category_brand_sold + ty_visits + month + trend, data = YuzTemizleyici)
summary(lmYuzTemizleyici)
##
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count +
## favored_count + category_sold + category_basket + category_favored +
## category_brand_sold + ty_visits + month + trend, data = YuzTemizleyici)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.787 -8.952 -0.527 6.851 73.381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.543e+02 2.637e+01 5.853 1.09e-08 ***
## price -1.879e+00 2.986e-01 -6.293 9.06e-10 ***
## visit_count 1.290e-02 2.368e-03 5.449 9.43e-08 ***
## basket_count 9.007e-02 1.019e-02 8.835 < 2e-16 ***
## favored_count -1.906e-02 4.564e-03 -4.175 3.74e-05 ***
## category_sold -2.341e-02 3.326e-03 -7.039 9.83e-12 ***
## category_basket -4.179e-04 5.236e-05 -7.982 1.95e-14 ***
## category_favored 1.645e-03 1.884e-04 8.730 < 2e-16 ***
## category_brand_sold 7.810e-04 1.493e-04 5.230 2.88e-07 ***
## ty_visits 3.107e-07 1.003e-07 3.098 0.0021 **
## month -7.796e-01 3.981e-01 -1.958 0.0510 .
## trend 2.807e-03 2.080e-02 0.135 0.8927
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.38 on 360 degrees of freedom
## Multiple R-squared: 0.8118, Adjusted R-squared: 0.806
## F-statistic: 141.2 on 11 and 360 DF, p-value: < 2.2e-16
lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count +
favored_count + category_sold + category_basket + category_favored +
category_brand_sold + ty_visits + month + trend + as.factor(day),
data = YuzTemizleyici)
summary(lmYuzTemizleyici)
##
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count +
## favored_count + category_sold + category_basket + category_favored +
## category_brand_sold + ty_visits + month + trend + as.factor(day),
## data = YuzTemizleyici)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.705 -8.185 -1.266 7.948 70.298
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.639e+02 2.674e+01 6.130 2.34e-09 ***
## price -2.047e+00 3.000e-01 -6.823 3.86e-11 ***
## visit_count 1.326e-02 2.356e-03 5.626 3.76e-08 ***
## basket_count 8.573e-02 1.015e-02 8.449 7.74e-16 ***
## favored_count -1.783e-02 4.522e-03 -3.942 9.73e-05 ***
## category_sold -2.477e-02 3.311e-03 -7.482 5.85e-13 ***
## category_basket -4.480e-04 5.240e-05 -8.549 3.77e-16 ***
## category_favored 1.670e-03 1.863e-04 8.964 < 2e-16 ***
## category_brand_sold 7.957e-04 1.482e-04 5.370 1.43e-07 ***
## ty_visits 3.729e-07 1.008e-07 3.701 0.000249 ***
## month -7.743e-01 3.956e-01 -1.957 0.051088 .
## trend 5.458e-03 2.081e-02 0.262 0.793241
## as.factor(day)2 6.161e+00 3.357e+00 1.835 0.067328 .
## as.factor(day)3 4.774e+00 3.393e+00 1.407 0.160257
## as.factor(day)4 4.780e+00 3.412e+00 1.401 0.162011
## as.factor(day)5 8.350e+00 3.396e+00 2.458 0.014431 *
## as.factor(day)6 9.185e-01 3.399e+00 0.270 0.787122
## as.factor(day)7 -3.305e+00 3.354e+00 -0.985 0.325145
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.13 on 354 degrees of freedom
## Multiple R-squared: 0.8202, Adjusted R-squared: 0.8115
## F-statistic: 94.97 on 17 and 354 DF, p-value: < 2.2e-16
lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count +
favored_count + category_sold + category_basket + category_favored +
category_brand_sold + ty_visits + as.factor(month) + as.factor(day),
data = YuzTemizleyici)
summary(lmYuzTemizleyici)
##
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count +
## favored_count + category_sold + category_basket + category_favored +
## category_brand_sold + ty_visits + as.factor(month) + as.factor(day),
## data = YuzTemizleyici)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.245 -9.025 -0.907 7.586 74.700
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.906e+02 2.819e+01 6.763 5.80e-11 ***
## price -2.229e+00 3.273e-01 -6.809 4.38e-11 ***
## visit_count 1.209e-02 2.668e-03 4.530 8.13e-06 ***
## basket_count 7.837e-02 1.174e-02 6.673 9.99e-11 ***
## favored_count -1.618e-02 4.710e-03 -3.435 0.000664 ***
## category_sold -2.383e-02 3.487e-03 -6.834 3.74e-11 ***
## category_basket -3.549e-04 6.846e-05 -5.184 3.70e-07 ***
## category_favored 1.739e-03 1.933e-04 8.995 < 2e-16 ***
## category_brand_sold 4.423e-04 2.453e-04 1.803 0.072219 .
## ty_visits 3.725e-07 9.860e-08 3.778 0.000186 ***
## as.factor(month)2 -1.843e+01 7.919e+00 -2.327 0.020564 *
## as.factor(month)3 -1.525e+01 7.825e+00 -1.949 0.052144 .
## as.factor(month)4 -2.222e+01 8.397e+00 -2.646 0.008508 **
## as.factor(month)5 -1.508e+01 8.285e+00 -1.821 0.069534 .
## as.factor(month)6 -1.015e+01 8.012e+00 -1.267 0.206124
## as.factor(month)7 -1.988e+01 8.140e+00 -2.443 0.015077 *
## as.factor(month)8 -1.959e+01 8.303e+00 -2.360 0.018849 *
## as.factor(month)9 -1.744e+01 8.239e+00 -2.117 0.034979 *
## as.factor(month)10 -1.589e+01 8.027e+00 -1.979 0.048561 *
## as.factor(month)11 -2.218e+01 8.196e+00 -2.706 0.007147 **
## as.factor(month)12 -9.922e+00 4.930e+00 -2.013 0.044937 *
## as.factor(day)2 5.948e+00 3.345e+00 1.778 0.076268 .
## as.factor(day)3 3.933e+00 3.415e+00 1.152 0.250177
## as.factor(day)4 3.736e+00 3.442e+00 1.085 0.278545
## as.factor(day)5 7.402e+00 3.434e+00 2.155 0.031820 *
## as.factor(day)6 5.805e-02 3.423e+00 0.017 0.986480
## as.factor(day)7 -4.110e+00 3.362e+00 -1.223 0.222279
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.03 on 345 degrees of freedom
## Multiple R-squared: 0.8268, Adjusted R-squared: 0.8138
## F-statistic: 63.36 on 26 and 345 DF, p-value: < 2.2e-16
lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count +
favored_count + category_sold + category_basket + category_favored +
category_brand_sold + ty_visits + as.factor(month) + trend +
as.factor(day), data = YuzTemizleyici)
summary(lmYuzTemizleyici)
##
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count +
## favored_count + category_sold + category_basket + category_favored +
## category_brand_sold + ty_visits + as.factor(month) + trend +
## as.factor(day), data = YuzTemizleyici)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.107 -9.139 -0.956 7.566 74.958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.946e+02 3.062e+01 6.355 6.60e-10 ***
## price -2.247e+00 3.323e-01 -6.761 5.87e-11 ***
## visit_count 1.186e-02 2.754e-03 4.307 2.16e-05 ***
## basket_count 7.798e-02 1.182e-02 6.600 1.56e-10 ***
## favored_count -1.608e-02 4.726e-03 -3.403 0.000746 ***
## category_sold -2.378e-02 3.495e-03 -6.804 4.53e-11 ***
## category_basket -3.458e-04 7.377e-05 -4.688 3.98e-06 ***
## category_favored 1.748e-03 1.953e-04 8.949 < 2e-16 ***
## category_brand_sold 4.162e-04 2.577e-04 1.615 0.107202
## ty_visits 3.518e-07 1.165e-07 3.019 0.002724 **
## as.factor(month)2 -1.765e+01 8.258e+00 -2.138 0.033243 *
## as.factor(month)3 -1.495e+01 7.885e+00 -1.897 0.058726 .
## as.factor(month)4 -2.198e+01 8.438e+00 -2.605 0.009585 **
## as.factor(month)5 -1.414e+01 8.765e+00 -1.613 0.107681
## as.factor(month)6 -8.480e+00 9.448e+00 -0.898 0.370015
## as.factor(month)7 -1.862e+01 8.991e+00 -2.070 0.039160 *
## as.factor(month)8 -1.874e+01 8.701e+00 -2.153 0.031991 *
## as.factor(month)9 -1.693e+01 8.388e+00 -2.019 0.044261 *
## as.factor(month)10 -1.569e+01 8.059e+00 -1.946 0.052410 .
## as.factor(month)11 -2.240e+01 8.233e+00 -2.721 0.006845 **
## as.factor(month)12 -9.779e+00 4.955e+00 -1.974 0.049232 *
## trend -1.186e-02 3.549e-02 -0.334 0.738440
## as.factor(day)2 5.813e+00 3.374e+00 1.723 0.085782 .
## as.factor(day)3 3.795e+00 3.444e+00 1.102 0.271298
## as.factor(day)4 3.540e+00 3.496e+00 1.013 0.311882
## as.factor(day)5 7.201e+00 3.490e+00 2.063 0.039856 *
## as.factor(day)6 -1.457e-01 3.481e+00 -0.042 0.966629
## as.factor(day)7 -4.175e+00 3.372e+00 -1.238 0.216436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.05 on 344 degrees of freedom
## Multiple R-squared: 0.8269, Adjusted R-squared: 0.8133
## F-statistic: 60.86 on 27 and 344 DF, p-value: < 2.2e-16
Up to now, we’ve achieved 0.8159 Adjusted R-Squared value.
We’re adding the Residual attribute:
YuzTemizleyici <- YuzTemizleyici[, `:=`(Residual, 0)]
YuzTemizleyici$Residual[1] = NA
YuzTemizleyici$Residual[2:372] <- residuals(lmYuzTemizleyici)[1:371]
lmYuzTemizleyici <- lm(log_sold ~ price + visit_count + basket_count +
favored_count + category_sold + category_basket + category_favored +
category_brand_sold + ty_visits + month + trend + as.factor(day) +
Residual, data = YuzTemizleyici)
summary(lmYuzTemizleyici)
##
## Call:
## lm(formula = log_sold ~ price + visit_count + basket_count +
## favored_count + category_sold + category_basket + category_favored +
## category_brand_sold + ty_visits + month + trend + as.factor(day) +
## Residual, data = YuzTemizleyici)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82704 -0.15155 0.00492 0.17851 0.80531
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.294e+00 4.125e-01 15.257 < 2e-16 ***
## price -3.598e-02 4.627e-03 -7.778 8.25e-14 ***
## visit_count 1.597e-04 3.670e-05 4.351 1.77e-05 ***
## basket_count 1.061e-03 1.608e-04 6.599 1.52e-10 ***
## favored_count -2.680e-04 7.007e-05 -3.824 0.000155 ***
## category_sold -3.528e-04 5.636e-05 -6.260 1.12e-09 ***
## category_basket -7.311e-06 8.086e-07 -9.041 < 2e-16 ***
## category_favored 2.581e-05 2.877e-06 8.972 < 2e-16 ***
## category_brand_sold 1.174e-05 2.286e-06 5.137 4.63e-07 ***
## ty_visits 7.016e-09 1.556e-09 4.509 8.89e-06 ***
## month -3.368e-02 6.101e-03 -5.520 6.60e-08 ***
## trend -7.851e-04 3.211e-04 -2.445 0.014980 *
## as.factor(day)2 1.027e-01 5.198e-02 1.976 0.048897 *
## as.factor(day)3 8.666e-02 5.232e-02 1.656 0.098567 .
## as.factor(day)4 7.739e-02 5.259e-02 1.472 0.142027
## as.factor(day)5 1.311e-01 5.236e-02 2.503 0.012753 *
## as.factor(day)6 1.623e-02 5.241e-02 0.310 0.756969
## as.factor(day)7 -4.111e-02 5.170e-02 -0.795 0.427092
## Residual 3.120e-03 8.391e-04 3.719 0.000233 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.264 on 352 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8121, Adjusted R-squared: 0.8025
## F-statistic: 84.53 on 18 and 352 DF, p-value: < 2.2e-16
We’re adding the lag_1 attribute:
YuzTemizleyici[, `:=`(lag_1, NA)]
YuzTemizleyici$lag_1[2:372] <- YuzTemizleyici$sold_count[1:371]
lmYuzTemizleyici <- lm(sold_count ~ price + visit_count + basket_count +
favored_count + category_sold + category_basket + category_favored +
category_brand_sold + ty_visits + as.factor(month) + trend +
as.factor(day) + Residual + lag_1, data = YuzTemizleyici)
summary(lmYuzTemizleyici)
##
## Call:
## lm(formula = sold_count ~ price + visit_count + basket_count +
## favored_count + category_sold + category_basket + category_favored +
## category_brand_sold + ty_visits + as.factor(month) + trend +
## as.factor(day) + Residual + lag_1, data = YuzTemizleyici)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.073 -8.805 -0.641 7.333 75.074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.189e+02 3.258e+01 6.718 7.72e-11 ***
## price -2.495e+00 3.524e-01 -7.082 8.16e-12 ***
## visit_count 1.300e-02 2.766e-03 4.701 3.77e-06 ***
## basket_count 8.190e-02 1.191e-02 6.879 2.89e-11 ***
## favored_count -1.681e-02 4.686e-03 -3.588 0.000382 ***
## category_sold -2.576e-02 3.845e-03 -6.701 8.58e-11 ***
## category_basket -3.742e-04 7.398e-05 -5.059 6.92e-07 ***
## category_favored 1.883e-03 2.068e-04 9.103 < 2e-16 ***
## category_brand_sold 4.297e-04 2.547e-04 1.687 0.092460 .
## ty_visits 3.626e-07 1.172e-07 3.094 0.002138 **
## as.factor(month)2 -1.724e+01 8.230e+00 -2.094 0.036968 *
## as.factor(month)3 -1.493e+01 7.903e+00 -1.889 0.059793 .
## as.factor(month)4 -2.166e+01 8.438e+00 -2.567 0.010692 *
## as.factor(month)5 -1.383e+01 8.654e+00 -1.598 0.110888
## as.factor(month)6 -8.408e+00 9.316e+00 -0.903 0.367407
## as.factor(month)7 -1.969e+01 8.936e+00 -2.204 0.028203 *
## as.factor(month)8 -2.013e+01 8.647e+00 -2.328 0.020509 *
## as.factor(month)9 -1.829e+01 8.318e+00 -2.199 0.028548 *
## as.factor(month)10 -1.729e+01 7.996e+00 -2.162 0.031286 *
## as.factor(month)11 -2.415e+01 8.159e+00 -2.959 0.003297 **
## as.factor(month)12 -1.072e+01 4.928e+00 -2.175 0.030330 *
## trend -2.156e-02 3.502e-02 -0.616 0.538468
## as.factor(day)2 6.390e+00 3.368e+00 1.897 0.058626 .
## as.factor(day)3 3.865e+00 3.392e+00 1.139 0.255316
## as.factor(day)4 3.539e+00 3.440e+00 1.029 0.304385
## as.factor(day)5 6.490e+00 3.446e+00 1.883 0.060527 .
## as.factor(day)6 -1.152e+00 3.452e+00 -0.334 0.738905
## as.factor(day)7 -4.801e+00 3.332e+00 -1.441 0.150577
## Residual 2.675e-01 7.360e-02 3.634 0.000322 ***
## lag_1 -8.671e-02 5.044e-02 -1.719 0.086520 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.77 on 341 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.8338, Adjusted R-squared: 0.8197
## F-statistic: 59.01 on 29 and 341 DF, p-value: < 2.2e-16
AIC(lmYuzTemizleyici)
## [1] 3175.888
# this is the best model I can achieve.
With the 0.8224 Adjusted R-squared value, this is the best model we achieved.
checkresiduals(lmYuzTemizleyici)
##
## Breusch-Godfrey test for serial correlation of order up to 33
##
## data: Residuals
## LM test = 33.684, df = 33, p-value = 0.4342
Residuals seem have 0 mean, although the variance increases at some points. There aren’t any significant autocorrelation and residuals looks like normally distributed.
##Decomposing the Data
# time series and arima
tsYuzTemizleyici <- ts(YuzTemizleyici$log_sold, frequency = 7)
ts.plot(tsYuzTemizleyici)
finaldecompYuzTemizleyici <- decompose(tsYuzTemizleyici, type = "additive")
plot(finaldecompYuzTemizleyici)
auto.arima(finaldecompYuzTemizleyici$random, seasonal = FALSE,
trace = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,0,2) with non-zero mean : 29.53309
## ARIMA(0,0,0) with non-zero mean : 187.0031
## ARIMA(1,0,0) with non-zero mean : 173.1295
## ARIMA(0,0,1) with non-zero mean : 165.6274
## ARIMA(0,0,0) with zero mean : 184.9823
## ARIMA(1,0,2) with non-zero mean : 73.04316
## ARIMA(2,0,1) with non-zero mean : 36.10984
## ARIMA(3,0,2) with non-zero mean : Inf
## ARIMA(2,0,3) with non-zero mean : 30.68142
## ARIMA(1,0,1) with non-zero mean : 167.7132
## ARIMA(1,0,3) with non-zero mean : Inf
## ARIMA(3,0,1) with non-zero mean : 28.69084
## ARIMA(3,0,0) with non-zero mean : 117.2414
## ARIMA(4,0,1) with non-zero mean : 54.01181
## ARIMA(2,0,0) with non-zero mean : 146.9046
## ARIMA(4,0,0) with non-zero mean : 108.0943
## ARIMA(4,0,2) with non-zero mean : 32.36692
## ARIMA(3,0,1) with zero mean : 27.27624
## ARIMA(2,0,1) with zero mean : 34.5864
## ARIMA(3,0,0) with zero mean : 115.1856
## ARIMA(4,0,1) with zero mean : 52.33744
## ARIMA(3,0,2) with zero mean : Inf
## ARIMA(2,0,0) with zero mean : 144.8623
## ARIMA(2,0,2) with zero mean : 27.9751
## ARIMA(4,0,0) with zero mean : 106.0316
## ARIMA(4,0,2) with zero mean : 30.83249
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(3,0,1) with zero mean : Inf
## ARIMA(2,0,2) with zero mean : Inf
## ARIMA(3,0,1) with non-zero mean : Inf
## ARIMA(2,0,2) with non-zero mean : Inf
## ARIMA(2,0,3) with non-zero mean : Inf
## ARIMA(4,0,2) with zero mean : Inf
## ARIMA(4,0,2) with non-zero mean : Inf
## ARIMA(2,0,1) with zero mean : Inf
## ARIMA(2,0,1) with non-zero mean : Inf
## ARIMA(4,0,1) with zero mean : Inf
## ARIMA(4,0,1) with non-zero mean : Inf
## ARIMA(1,0,2) with non-zero mean : Inf
## ARIMA(4,0,0) with zero mean : 106.9794
##
## Best model: ARIMA(4,0,0) with zero mean
## Series: finaldecompYuzTemizleyici$random
## ARIMA(4,0,0) with zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4
## 0.1337 -0.2320 -0.2587 -0.1645
## s.e. 0.0517 0.0503 0.0503 0.0517
##
## sigma^2 estimated as 0.077: log likelihood=-48.41
## AIC=106.81 AICc=106.98 BIC=126.33
YTARIMAmodel2 <- arima(finaldecompYuzTemizleyici$random, order = c(3,
0, 1))
YTARIMAmodel2
##
## Call:
## arima(x = finaldecompYuzTemizleyici$random, order = c(3, 0, 1))
##
## Coefficients:
## ar1 ar2 ar3 ma1 intercept
## 0.8616 -0.3719 -0.1223 -1.0000 0e+00
## s.e. 0.0519 0.0659 0.0519 0.0069 2e-04
##
## sigma^2 estimated as 0.05866: log likelihood = -3.38, aic = 18.76
AIC(YTARIMAmodel2)
## [1] 18.76292
##ARIMA Models for Attributes
We’ve created ARIMA models for the attributes in the our final regression model. In order to predict sold quantity, we have to predict attributes first.
price_ts <- ts(YuzTemizleyici$price, start = as.Date("2021-05-04"),
end = as.Date("2021-05-31"), frequency = 7)
price_dec <- decompose(x = price_ts, type = "additive")
price_model = auto.arima(price_dec$random)
AIC(price_model)
## [1] 767.0668
price_model_forecast <- predict(price_model, n.ahead = 1)$pred
seasonality = price_dec$seasonal[1:1]
last_trend_value <- tail(price_dec$trend[!is.na(price_dec$trend)],
1)
price_model_forecast = price_model_forecast + last_trend_value +
seasonality
price_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 66.95522
visit_count_ts <- ts(YuzTemizleyici$visit_count, start = as.Date("2021-01-28"),
end = as.Date("2021-05-31"), frequency = 7)
visit_count_dec <- decompose(x = visit_count_ts, type = "additive")
visit_count_model = auto.arima(visit_count_dec$random)
AIC(visit_count_model)
## [1] 12932.84
visit_count_model_forecast <- predict(visit_count_model, n.ahead = 1)$pred
seasonality = visit_count_dec$seasonal[1:1]
last_trend_value <- tail(visit_count_dec$trend[!is.na(visit_count_dec$trend)],
1)
visit_count_model_forecast = visit_count_model_forecast + last_trend_value +
seasonality
visit_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 4286.376
basket_count_ts <- ts(YuzTemizleyici$basket_count, start = as.Date("2021-04-28"),
end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 2747.135
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 1)$pred
seasonality = basket_count_dec$seasonal[1:1]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)],
1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value +
seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 178.9897
favored_count_ts <- ts(YuzTemizleyici$favored_count, start = as.Date("2021-01-28"),
end = as.Date("2021-05-31"), frequency = 7)
favored_count_dec <- decompose(x = favored_count_ts, type = "additive")
favored_count_model = auto.arima(favored_count_dec$random)
AIC(favored_count_model)
## [1] 11869.29
favored_count_model_forecast <- predict(favored_count_model,
n.ahead = 1)$pred
seasonality = favored_count_dec$seasonal[1:1]
last_trend_value <- tail(favored_count_dec$trend[!is.na(favored_count_dec$trend)],
1)
favored_count_model_forecast = favored_count_model_forecast +
last_trend_value + seasonality
favored_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 719.3471
category_sold_ts <- ts(YuzTemizleyici$category_sold, start = as.Date("2021-04-28"),
end = as.Date("2021-05-31"), frequency = 7)
category_sold_dec <- decompose(x = category_sold_ts, type = "additive")
category_sold_model = auto.arima(category_sold_dec$random)
AIC(category_sold_model)
## [1] 3125.509
category_sold_model_forecast <- predict(category_sold_model,
n.ahead = 1)$pred
seasonality = category_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_sold_dec$trend[!is.na(category_sold_dec$trend)],
1)
category_sold_model_forecast = category_sold_model_forecast +
last_trend_value + seasonality
category_sold_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 324.108
category_basket_ts <- ts(YuzTemizleyici$category_basket, start = as.Date("2021-01-28"),
end = as.Date("2021-05-31"), frequency = 7)
category_basket_dec <- decompose(x = category_basket_ts, type = "additive")
category_basket_model = auto.arima(category_basket_dec$random)
AIC(category_basket_model)
## [1] 19434.99
category_basket_model_forecast <- predict(category_basket_model,
n.ahead = 1)$pred
seasonality = category_basket_dec$seasonal[1:1]
last_trend_value <- tail(category_basket_dec$trend[!is.na(category_basket_dec$trend)],
1)
category_basket_model_forecast = category_basket_model_forecast +
last_trend_value + seasonality
category_basket_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 262743.1
category_favored_ts <- ts(YuzTemizleyici$category_favored, start = as.Date("2021-04-28"),
end = as.Date("2021-05-31"), frequency = 7)
category_favored_dec <- decompose(x = category_favored_ts, type = "additive")
category_favored_model = auto.arima(category_favored_dec$random)
AIC(category_favored_model)
## [1] 4432.176
category_favored_model_forecast <- predict(category_favored_model,
n.ahead = 1)$pred
seasonality = category_favored_dec$seasonal[1:1]
last_trend_value <- tail(category_favored_dec$trend[!is.na(category_favored_dec$trend)],
1)
category_favored_model_forecast = category_favored_model_forecast +
last_trend_value + seasonality
category_favored_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 13984.14
category_brand_sold_ts <- ts(YuzTemizleyici$category_brand_sold,
start = as.Date("2021-01-28"), end = as.Date("2021-05-31"),
frequency = 7)
category_brand_sold_dec <- decompose(x = category_brand_sold_ts,
type = "additive")
category_brand_sold_model = auto.arima(category_brand_sold_dec$random)
AIC(category_brand_sold_model)
## [1] 16680.66
category_brand_sold_model_forecast <- predict(category_brand_sold_model,
n.ahead = 1)$pred
seasonality = category_brand_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_brand_sold_dec$trend[!is.na(category_brand_sold_dec$trend)],
1)
category_brand_sold_model_forecast = category_brand_sold_model_forecast +
last_trend_value + seasonality
category_brand_sold_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 38168.86
ty_visits_ts <- ts(YuzTemizleyici$ty_visits, start = as.Date("2021-01-28"),
end = as.Date("2021-05-31"), frequency = 7)
ty_visits_dec <- decompose(x = ty_visits_ts, type = "additive")
ty_visits_model = auto.arima(ty_visits_dec$random)
AIC(ty_visits_model)
## [1] 29881.78
ty_visits_model_forecast <- predict(ty_visits_model, n.ahead = 1)$pred
seasonality = ty_visits_dec$seasonal[1:1]
last_trend_value <- tail(ty_visits_dec$trend[!is.na(ty_visits_dec$trend)],
1)
ty_visits_model_forecast = ty_visits_model_forecast + last_trend_value +
seasonality
ty_visits_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 116744719
Residual_ts <- ts(YuzTemizleyici$Residual, start = as.Date("2021-04-28"),
end = as.Date("2021-05-31"), frequency = 7)
Residual_dec <- decompose(x = Residual_ts, type = "additive")
Residual_model = auto.arima(Residual_dec$random)
AIC(Residual_model)
## [1] 1908.616
Residual_model_forecast <- predict(Residual_model, n.ahead = 1)$pred
seasonality = Residual_dec$seasonal[1:1]
last_trend_value <- tail(Residual_dec$trend[!is.na(Residual_dec$trend)],
1)
Residual_model_forecast = Residual_model_forecast + last_trend_value +
seasonality
Residual_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 3.991736
lag_1_ts <- ts(YuzTemizleyici$lag_1, start = as.Date("2021-04-28"),
end = as.Date("2021-05-31"), frequency = 7)
lag_1_dec <- decompose(x = lag_1_ts, type = "additive")
lag_1_model = auto.arima(lag_1_dec$random)
AIC(lag_1_model)
## [1] 2053.699
lag_1_model_forecast <- predict(lag_1_model, n.ahead = 1)$pred
seasonality = lag_1_dec$seasonal[1:1]
last_trend_value <- tail(lag_1_dec$trend[!is.na(lag_1_dec$trend)],
1)
lag_1_model_forecast = lag_1_model_forecast + last_trend_value +
seasonality
lag_1_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 48.06433
We’ve forecasted the attributes until now.
##Prediction
predict(lmYuzTemizleyici, data.frame(price = price_model_forecast,
visit_count = visit_count_model_forecast, basket_count = basket_count_model_forecast,
favored_count = favored_count_model_forecast, category_sold = category_sold_model_forecast,
category_basket = category_basket_model_forecast, category_favored = category_favored_model_forecast,
category_brand_sold = category_brand_sold_model_forecast,
ty_visits = ty_visits_model_forecast, month = as.factor(6),
trend = 0, day = as.factor(3), Residual = Residual_model_forecast,
lag_1 = lag_1_model_forecast))
## 1
## 80.82597
Our prediction for YuzTemizleyici sold quantity is 80.90704
You can also embed plots, for example:
DisFircasi <- projectdata[product_content_id == 32939029]
(DisFircasi)
## event_date product_content_id price sold_count visit_count basket_count
## 1: 2021-05-31 32939029 141.2127 125 5301 608
## 2: 2021-05-30 32939029 141.8578 139 5049 604
## 3: 2021-05-29 32939029 142.1202 133 5383 615
## 4: 2021-05-28 32939029 137.6755 182 5853 739
## 5: 2021-05-27 32939029 136.0229 235 6720 911
## ---
## 368: 2020-05-29 32939029 126.1038 52 0 257
## 369: 2020-05-28 32939029 115.1035 84 0 370
## 370: 2020-05-27 32939029 114.1078 103 0 398
## 371: 2020-05-26 32939029 115.8495 101 0 411
## 372: 2020-05-25 32939029 112.9000 74 0 323
## favored_count category_sold category_visits category_basket
## 1: 698 930 41973 3911
## 2: 609 911 43021 3801
## 3: 624 895 40310 3611
## 4: 608 807 861 34608
## 5: 869 800 853 36514
## ---
## 368: 0 810 851 0
## 369: 0 927 978 0
## 370: 0 1071 1125 0
## 371: 0 1351 1419 0
## 372: 0 1193 1231 0
## category_favored category_brand_sold ty_visits
## 1: 5791 875 125439876
## 2: 4602 866 131821083
## 3: 3946 857 129670029
## 4: 3214 3370 103514886
## 5: 3377 3946 107391579
## ---
## 368: 2878 0 1
## 369: 3336 0 1
## 370: 3876 0 1
## 371: 4647 0 1
## 372: 4132 0 1
summary(DisFircasi)
## event_date product_content_id price sold_count
## Min. :2020-05-25 Min. :32939029 Min. :110.1 Min. : 0.00
## 1st Qu.:2020-08-25 1st Qu.:32939029 1st Qu.:128.9 1st Qu.: 18.00
## Median :2020-11-26 Median :32939029 Median :135.4 Median : 52.00
## Mean :2020-11-26 Mean :32939029 Mean :134.3 Mean : 92.21
## 3rd Qu.:2021-02-27 3rd Qu.:32939029 3rd Qu.:140.0 3rd Qu.:139.50
## Max. :2021-05-31 Max. :32939029 Max. :165.9 Max. :513.00
## NA's :9
## visit_count basket_count favored_count category_sold
## Min. : 0 Min. : 0.00 Min. : 0.0 Min. : 321
## 1st Qu.: 0 1st Qu.: 83.75 1st Qu.: 0.0 1st Qu.: 598
## Median : 0 Median : 216.50 Median : 0.0 Median : 806
## Mean : 2119 Mean : 392.81 Mean : 343.9 Mean :1024
## 3rd Qu.: 4097 3rd Qu.: 578.00 3rd Qu.: 572.2 3rd Qu.:1116
## Max. :15725 Max. :2249.00 Max. :2696.0 Max. :5557
##
## category_visits category_basket category_favored category_brand_sold
## Min. : 346.0 Min. : 0 Min. : 1242 Min. : 0
## 1st Qu.: 640.0 1st Qu.: 0 1st Qu.: 2426 1st Qu.: 0
## Median : 854.5 Median : 0 Median : 3300 Median : 0
## Mean : 1406.9 Mean : 19977 Mean : 4266 Mean : 3187
## 3rd Qu.: 1185.0 3rd Qu.: 44483 3rd Qu.: 5050 3rd Qu.: 5428
## Max. :43021.0 Max. :281022 Max. :44445 Max. :28944
##
## ty_visits
## Min. : 1
## 1st Qu.: 1
## Median : 1
## Mean : 39145174
## 3rd Qu.: 97619487
## Max. :178545693
##
ts.plot(DisFircasi[, c("sold_count")], main = "Daily Sales Quantity",
xlab = "Time", ylab = "Sales Quantity")
In the plot we can the increasing trend in the sales quantities. Since there is a big shift in the number of sales, we will only use the values between 1:150
DisFircasi <- DisFircasi[1:150]
ts.plot(DisFircasi[, c("sold_count")], main = "Daily Sales Quantity",
xlab = "Time", ylab = "Sales Quantity")
We are adding month, day, trend attributes to the data in order to better check.
DisFircasi[, `:=`(month, month(event_date))]
DisFircasi[, `:=`(day, lubridate::wday(event_date))]
DisFircasi[, `:=`(trend, 1:.N)]
##Regression Model
After trying lots of models, the best regression model I achived is like that with 0.8655 adjusted R squared value.
lmDisFircasi <- lm(sold_count ~ basket_count + favored_count +
category_sold + category_favored + category_brand_sold, data = DisFircasi)
summary(lmDisFircasi)
##
## Call:
## lm(formula = sold_count ~ basket_count + favored_count + category_sold +
## category_favored + category_brand_sold, data = DisFircasi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -146.90 -21.14 -5.19 18.23 104.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.789120 8.792242 2.251 0.02592 *
## basket_count 0.240600 0.016440 14.635 < 2e-16 ***
## favored_count -0.042636 0.013778 -3.094 0.00237 **
## category_sold 0.032223 0.013276 2.427 0.01645 *
## category_favored -0.002403 0.001639 -1.466 0.14479
## category_brand_sold -0.002853 0.002377 -1.201 0.23185
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.81 on 144 degrees of freedom
## Multiple R-squared: 0.87, Adjusted R-squared: 0.8655
## F-statistic: 192.7 on 5 and 144 DF, p-value: < 2.2e-16
# this is the best model I can achieve.
##Decomposing
# time series and arima
tsDisFircasi <- ts(DisFircasi$sold_count, frequency = 7)
decompDisFircasi <- decompose(tsDisFircasi, type = "additive")
finalARIMAmodel3 <- arima(decompDisFircasi$random, order = c(1,
0, 5))
AIC(finalARIMAmodel3)
## [1] 1420.678
##ARIMA Models of Attributes
basket_count_ts <- ts(DisFircasi$basket_count, start = as.Date("2021-04-26"),
end = as.Date("2021-05-31"), frequency = 7)
basket_count_dec <- decompose(x = basket_count_ts, type = "additive")
basket_count_model = auto.arima(basket_count_dec$random)
AIC(basket_count_model)
## [1] 3244.35
basket_count_model_forecast <- predict(basket_count_model, n.ahead = 1)$pred
seasonality = basket_count_dec$seasonal[1:1]
last_trend_value <- tail(basket_count_dec$trend[!is.na(basket_count_dec$trend)],
1)
basket_count_model_forecast = basket_count_model_forecast + last_trend_value +
seasonality
basket_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 654.4589
favored_count_ts <- ts(DisFircasi$favored_count, start = as.Date("2021-04-26"),
end = as.Date("2021-05-31"), frequency = 7)
favored_count_dec <- decompose(x = favored_count_ts, type = "additive")
favored_count_model = auto.arima(favored_count_dec$random)
AIC(favored_count_model)
## [1] 3327.247
favored_count_model_forecast <- predict(favored_count_model,
n.ahead = 1)$pred
seasonality = favored_count_dec$seasonal[1:1]
last_trend_value <- tail(favored_count_dec$trend[!is.na(favored_count_dec$trend)],
1)
favored_count_model_forecast = favored_count_model_forecast +
last_trend_value + seasonality
favored_count_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 805.653
category_sold_ts <- ts(DisFircasi$category_sold, start = as.Date("2021-04-26"),
end = as.Date("2021-05-31"), frequency = 7)
category_sold_dec <- decompose(x = category_sold_ts, type = "additive")
category_sold_model = auto.arima(category_sold_dec$random)
AIC(category_sold_model)
## [1] 3265.814
category_sold_model_forecast <- predict(category_sold_model,
n.ahead = 1)$pred
seasonality = category_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_sold_dec$trend[!is.na(category_sold_dec$trend)],
1)
category_sold_model_forecast = category_sold_model_forecast +
last_trend_value + seasonality
category_sold_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 776.2867
category_favored_ts <- ts(DisFircasi$category_favored, start = as.Date("2021-04-26"),
end = as.Date("2021-05-31"), frequency = 7)
category_favored_dec <- decompose(x = category_favored_ts, type = "additive")
category_favored_model = auto.arima(category_favored_dec$random)
AIC(category_favored_model)
## [1] 4461.847
category_favored_model_forecast <- predict(category_favored_model,
n.ahead = 1)$pred
seasonality = category_favored_dec$seasonal[1:1]
last_trend_value <- tail(category_favored_dec$trend[!is.na(category_favored_dec$trend)],
1)
category_favored_model_forecast = category_favored_model_forecast +
last_trend_value + seasonality
category_favored_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 4024.603
category_brand_sold_ts <- ts(DisFircasi$category_brand_sold,
start = as.Date("2021-04-26"), end = as.Date("2021-05-31"),
frequency = 7)
category_brand_sold_dec <- decompose(x = category_brand_sold_ts,
type = "additive")
category_brand_sold_model = auto.arima(category_brand_sold_dec$random)
AIC(category_brand_sold_model)
## [1] 4256.853
category_brand_sold_model_forecast <- predict(category_brand_sold_model,
n.ahead = 1)$pred
seasonality = category_brand_sold_dec$seasonal[1:1]
last_trend_value <- tail(category_brand_sold_dec$trend[!is.na(category_brand_sold_dec$trend)],
1)
category_brand_sold_model_forecast = category_brand_sold_model_forecast +
last_trend_value + seasonality
category_brand_sold_model_forecast
## Time Series:
## Start = c(18778, 2)
## End = c(18778, 2)
## Frequency = 7
## [1] 5161.364
We’ve forecasted the attributes until now.
##Prediction
predict(lmDisFircasi, data.frame(basket_count = basket_count_model_forecast,
favored_count = favored_count_model_forecast, category_sold = category_sold_model_forecast,
category_favored = category_favored_model_forecast, category_brand_sold = category_brand_sold_model_forecast))
## 1
## 143.5191